In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from src.mapk import *
import json
import requests
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from pymongo import MongoClient
import re
import csv
import ast

#Data Preprocessing

In [6]:
# Файлы
INPUT_DIR = 'C:/Dataset'

In [7]:
client = MongoClient("mongodb://root:password@localhost:27017/")

db = client["anime"]
collection = db["animelist"]

In [9]:
# Чтение файлов
anime_ratings = pd.read_csv(INPUT_DIR + '/rating_complete.csv',
                        low_memory=False,
                        decimal=',',
                        usecols=["user_id","anime_id","rating"]
                        )
anime_data = pd.read_csv(INPUT_DIR + '/anime.csv',
                        low_memory=False,
                        decimal=',',
                        usecols=["anime_id","name","score","genres","english_name","japanese_name","type","episodes","aired","premiered","producers","licensors","studios","source","duration","ratings","ranked"
                                 ,"popularity","members","favorites","watching","completed","on_hold","dropped","plan_to_watch"]
                        )
anime_synopsis = pd.read_csv(INPUT_DIR + '/anime_with_synopsis.csv',
                             low_memory=False,
                             decimal=',',
                             usecols=["anime_id", "name", "synopsis"],
                             encoding='latin1'
                             )

In [10]:
# Содержимое anime.csv
anime_data.head(3)

Unnamed: 0,anime_id,name,score,genres,english_name,japanese_name,type,episodes,aired,premiered,...,ratings,ranked,popularity,members,favorites,watching,completed,on_hold,dropped,plan_to_watch
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,R - 17+ (violence & profanity),28,39,1251960,61971,105808,718161,71513,26678,329800
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,1-Sep-01,Unknown,...,R - 17+ (violence & profanity),159,518,273145,1174,4143,208333,1935,770,57964
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,PG-13 - Teens 13 or older,266,201,558913,12944,29113,343492,25465,13925,146918


In [11]:
from sklearn.model_selection import train_test_split

# (60% train, 40% test)
anime_ratings, train_ratings = train_test_split(anime_ratings, test_size=0.6, random_state=42)

# (50% train, 50% test)
train_ratings, test_ratings = train_test_split(train_ratings, test_size=0.5, random_state=42)

In [12]:
# anime_ratings
len(anime_ratings)

23053311

In [13]:
# train_ratings
len(train_ratings)

17289983

In [14]:
# test_ratings
len(test_ratings)

17289984

In [15]:
# Cодержимое anime_ratings.csv
anime_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
40273468,247449,18099,8
24842538,152911,1047,4
38953564,239265,14227,7


In [16]:
# Пользователь должен оценить минимум 500 аниме (train_ratings)
ntrain_ratings = train_ratings['user_id'].value_counts()
train_ratings = train_ratings[train_ratings['user_id'].isin(ntrain_ratings[ntrain_ratings >= 500].index)].copy()
len(train_ratings)

749513

In [17]:
# Пользователь должен оценить минимум 500 аниме (test_ratings)
ntest_ratings = test_ratings['user_id'].value_counts()
test_ratings = test_ratings[test_ratings['user_id'].isin(ntest_ratings[ntest_ratings >= 500].index)].copy()
len(test_ratings)

747721

In [18]:
# Удаление Duplicated Rows
train_ratings = train_ratings.drop_duplicates()
test_ratings = test_ratings.drop_duplicates()

In [19]:
train_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
50579508,310065,32900,7
1733703,10851,19023,5
16250982,99690,819,4


In [20]:
# Создание сводной таблицы (pivot table). 
# По горизонтали будут аниме, по вертикали - пользователи, значения - оценки
user_item_matrix_train = train_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')
user_item_matrix_train.head()

user_id,781,890,1177,1397,1469,1946,3578,4773,5045,5648,...,350215,350286,351119,351361,351696,351801,352301,352761,352922,352930
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,5.0,,8.0,,,,...,,9.0,,,,,,,,
5,,,8.0,,,,,,,8.0,...,,,,8.0,,,9.0,,,
6,,,,,,,,8.0,,,...,,,,5.0,,,,,,
7,,,8.0,,,7.0,,7.0,,,...,,,,,,,,,,
8,,,,,,,,6.0,,,...,,,,,,,,,,


In [21]:
# NaN преобразовываю в нули
user_item_matrix_train.fillna(0, inplace = True)
user_item_matrix_train.head()

user_id,781,890,1177,1397,1469,1946,3578,4773,5045,5648,...,350215,350286,351119,351361,351696,351801,352301,352761,352922,352930
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,5.0,0.0,8.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,8.0,0.0,0.0,9.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,8.0,0.0,0.0,7.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Размерность матрицы "пользователи х anime"
user_item_matrix_train.shape

(16422, 1009)

In [23]:
# Преобразую разреженную матрицу в формат csr
# Метод values передаст функции csr_matrix только значения датафрейма
csr_data_train = csr_matrix(user_item_matrix_train.values)

In [24]:
user_item_matrix_train.head()

user_id,781,890,1177,1397,1469,1946,3578,4773,5045,5648,...,350215,350286,351119,351361,351696,351801,352301,352761,352922,352930
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,5.0,0.0,8.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,8.0,0.0,0.0,9.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,8.0,0.0,0.0,7.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Сброшу индекс с помощью reset_index()
user_item_matrix_train = user_item_matrix_train.rename_axis(None, axis = 1).reset_index()
user_item_matrix_train.head()

Unnamed: 0,anime_id,781,890,1177,1397,1469,1946,3578,4773,5045,...,350215,350286,351119,351361,351696,351801,352301,352761,352922,352930
0,1,0.0,0.0,0.0,0.0,5.0,0.0,8.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.0,0.0,0.0,9.0,0.0,0.0,0.0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,0.0,0.0,8.0,0.0,0.0,7.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Item Based

In [26]:
def get_recommendations(search_words, n_recommendations=10):
    recommendations = []  # Создание пустого списка для рекомендаций

    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)  # Инициализация модели k-ближайших соседей с параметрами: метрика - косинусное расстояние, алгоритм - brute force, количество соседей - 20, количество параллельных задач - все доступные ядра процессора
    knn.fit(csr_data_train)  # Обучение модели на тренировочных данных

    for word in search_words:  # Перебор по каждому слову в списке поисковых слов
        anime_search = anime_data[anime_data['name'].str.contains(word)]  # Поиск аниме, содержащего указанное слово в названии
        anime_id = anime_search.iloc[0]['anime_id']  # Получение идентификатора найденного аниме
        anime_id = int(user_item_matrix_train[user_item_matrix_train['anime_id'] == anime_id].index[0])  # Получение индекса аниме в user_item_matrix_train по его идентификатору

        distances, indices = knn.kneighbors(csr_data_train[anime_id], n_neighbors=n_recommendations + 1)  # Получение ближайших соседей для указанного аниме
        indices_list = indices.squeeze().tolist()  # Преобразование индексов соседей в список
        distances_list = distances.squeeze().tolist()  # Преобразование расстояний до соседей в список
        indices_distances = list(zip(indices_list, distances_list))  # Объединение индексов и расстояний в список кортежей
        indices_distances_sorted = sorted(indices_distances, key=lambda x: x[1], reverse=False)  # Сортировка списка по расстоянию в порядке возрастания
        indices_distances_sorted = indices_distances_sorted[1:]  # Исключение первого элемента, так как он соответствует исходному аниме

        for ind_dist in indices_distances_sorted:  # Перебор по каждому соседу
            anime_id = int(user_item_matrix_train.iloc[ind_dist[0]]['anime_id'])  # Получение идентификатора аниме по индексу
            anime_name = anime_data[anime_data['anime_id'] == anime_id]['name'].iloc[0]  # Получение названия аниме по его идентификатору
            recommendations.append({'anime_id': anime_id, 'name': anime_name})  # Добавление рекомендации в список рекомендаций

    return {'recommendations': recommendations}  # Возврат словаря с результатом рекомендаций

In [27]:
print(get_recommendations(['Naruto', 'Bleach'], 3))

{'recommendations': [{'anime_id': 4437, 'name': 'Naruto: Shippuuden Movie 2 - Kizuna'}, {'anime_id': 5114, 'name': 'Fullmetal Alchemist: Brotherhood'}, {'anime_id': 10863, 'name': 'Steins;Gate: Oukoubakko no Poriomania'}, {'anime_id': 834, 'name': 'Bleach: The Sealed Sword Frenzy'}, {'anime_id': 18115, 'name': 'Magi: The Kingdom of Magic'}, {'anime_id': 19815, 'name': 'No Game No Life'}]}


# User Based

In [28]:
# Создание матрицы пользователь-аниме
user_anime_matrix = csr_matrix((train_ratings['rating'],
                                (train_ratings['user_id'], train_ratings['anime_id'])))

# Создание модели NearestNeighbors
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(user_anime_matrix)

In [29]:
def get_user_based_recommendations(user_id, n_recommendations=5):
    # Найти индексы наиболее похожих пользователей
    _, indices = model.kneighbors(user_anime_matrix[user_id], n_neighbors=n_recommendations)
    similar_users = indices.flatten()

    # Получить список аниме, оцененных этими похожими пользователями
    similar_anime = train_ratings[train_ratings['user_id'].isin(similar_users)]['anime_id'].unique()

    # Исключить аниме, которые уже оценил выбранный пользователь
    user_rated_anime = train_ratings[train_ratings['user_id'] == user_id]['anime_id'].unique()
    recommended_anime = [anime_id for anime_id in similar_anime if anime_id not in user_rated_anime]

    # Получить названия и идентификаторы рекомендуемого аниме
    recommended_anime_data = anime_data[anime_data['anime_id'].isin(recommended_anime)][['anime_id', 'name']]
    recommended_anime_list = recommended_anime_data.head(n_recommendations).to_dict('records')

    return {'recommendations': recommended_anime_list}


In [30]:
train_ratings.head(2)

Unnamed: 0,user_id,anime_id,rating
50579508,310065,32900,7
1733703,10851,19023,5


In [31]:
# Получение рекомендаций для пользователя
print(get_user_based_recommendations(310065, 2))


{'recommendations': [{'anime_id': 6, 'name': 'Trigun'}, {'anime_id': 18, 'name': 'Initial D Fourth Stage'}]}


# Content Based

In [32]:
def get_content_recommendations(search_words, n_recommendations=10):
    recommendations = []
    
    # Создание матрицы признаков на основе синопсисов (content-based)
    content_matrix = anime_data[['anime_id', 'name']].copy()  # Создаем копию столбцов "anime_id" и "name" из данных аниме
    content_matrix = content_matrix.merge(anime_synopsis, on='anime_id', how='left')  # Объединяем синопсисы аниме по столбцу "anime_id"
    content_matrix['synopsis'] = content_matrix['synopsis'].fillna('')  # Заполняем пропущенные значения в столбце "synopsis" пустой строкой
    
    tfidf = TfidfVectorizer(stop_words='english')  # Создаем объект TfidfVectorizer для создания матрицы TF-IDF
    tfidf_matrix = tfidf.fit_transform(content_matrix['synopsis'].values.astype('U'))  # Преобразуем синопсисы в TF-IDF матрицу признаков
    
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_recommendations+1, n_jobs=-1)  # Инициализируем модель NearestNeighbors для поиска ближайших соседей
    knn.fit(tfidf_matrix)  # Обучаем модель на матрице признаков
    
    for word in search_words:
        anime_search = anime_data[anime_data['name'].str.contains(word)]  # Ищем аниме, в названии которого есть заданное слово
        
        if anime_search.empty:  # Если поиск не дал результатов, переходим к следующему поисковому слову
            continue
        
        anime_ids = anime_search['anime_id'].values  # Получаем список идентификаторов найденных аниме
        anime_recommendations = []  # Создаем список для хранения рекомендаций
        
        for anime_id in anime_ids:
            anime_index = content_matrix[content_matrix['anime_id'] == anime_id].index[0]  # Находим индекс аниме в матрице признаков
            distances, indices = knn.kneighbors(tfidf_matrix[anime_index], n_neighbors=n_recommendations + 1)  # Находим ближайших соседей для аниме
            indices_list = indices.squeeze().tolist()  # Преобразуем индексы в список
            distances_list = distances.squeeze().tolist()  # Преобразуем расстояния в список
            indices_distances = list(zip(indices_list, distances_list))  # Объединяем индексы и расстояния в кортежи
            indices_distances_sorted = sorted(indices_distances, key=lambda x: x[1], reverse=False)  # Сортируем по расстоянию
            indices_distances_sorted = indices_distances_sorted[1:]  # Исключаем первый элемент, который является самим аниме

            for ind_dist in indices_distances_sorted:
                similar_anime_index = ind_dist[0]  # Получаем индекс похожего аниме
                similar_anime_id = content_matrix.loc[similar_anime_index, 'anime_id']  # Получаем идентификатор похожего аниме
                anime_recommendations.append(similar_anime_id)  # Добавляем идентификатор в список рекомендаций
        
        anime_recommendations = list(set(anime_recommendations))[:n_recommendations]  # Извлекаем n уникальных рекомендаций
        for anime_id in anime_recommendations:
            anime_name = anime_data.loc[anime_data['anime_id'] == anime_id, 'name'].values[0]  # Получаем название аниме по идентификатору
            recommendations.append({"anime_id": anime_id, "name": anime_name})
    
    return {'recommendations': recommendations}  # Возвращаем словарь с рекомендациями


In [33]:
print(get_content_recommendations(['Naruto'], 10))

{'recommendations': [{'anime_id': 2562, 'name': 'Shion no Ou'}, {'anime_id': 36868, 'name': 'Toho Cinemas & I Love Snoopy: The Peanuts Movie Collab Logo Eizou'}, {'anime_id': 36358, 'name': 'Tsubiregg Tsuburegg March'}, {'anime_id': 36362, 'name': 'Hamburger no Tsukurikata'}, {'anime_id': 40971, 'name': 'Newsong'}, {'anime_id': 2060, 'name': 'Gall Force 1: Eternal Story'}, {'anime_id': 21517, 'name': 'Soratobu Usagi no Yuukai Boushi: Boku Iya Da yo!'}, {'anime_id': 28687, 'name': 'Poker'}, {'anime_id': 37391, 'name': 'Xi Yang Yang Yu Hui Tai Lang: Zhi Fei Ma Qi Yu Ji'}, {'anime_id': 20, 'name': 'Naruto'}]}


---

In [34]:
def merge_recommendations(search_words, n_recommendations, user_id):
    # Получение рекомендаций с использованием content-based метода
    content_based = get_content_recommendations(search_words, n_recommendations)
    
    # Получение рекомендаций с использованием user-based метода
    user_based = get_user_based_recommendations(user_id, n_recommendations)
    
    # Получение рекомендаций с использованием item-based метода
    item_based = get_recommendations(search_words, n_recommendations)
    
    # Объединение всех рекомендаций в один список
    all_recommendations = content_based['recommendations'] + user_based['recommendations'] + item_based['recommendations']
    
    # Удаление дубликатов
    unique_recommendations = []
    seen_ids = set()
    for recommendation in all_recommendations:
        anime_id = recommendation['anime_id']
        if anime_id not in seen_ids:
            unique_recommendations.append(recommendation)
            seen_ids.add(anime_id)
    
    # Сортировка по id
    unique_recommendations.sort(key=lambda x: x['anime_id'])
    
    return {'recommendations': unique_recommendations}


In [35]:
print(merge_recommendations(['Naruto', 'Bleach'], 10, 310065 ))

{'recommendations': [{'anime_id': 1, 'name': 'Cowboy Bebop'}, {'anime_id': 5, 'name': 'Cowboy Bebop: Tengoku no Tobira'}, {'anime_id': 6, 'name': 'Trigun'}, {'anime_id': 7, 'name': 'Witch Hunter Robin'}, {'anime_id': 15, 'name': 'Eyeshield 21'}, {'anime_id': 16, 'name': 'Hachimitsu to Clover'}, {'anime_id': 17, 'name': 'Hungry Heart: Wild Striker'}, {'anime_id': 18, 'name': 'Initial D Fourth Stage'}, {'anime_id': 20, 'name': 'Naruto'}, {'anime_id': 22, 'name': 'Tennis no Ouji-sama'}, {'anime_id': 30, 'name': 'Neon Genesis Evangelion'}, {'anime_id': 269, 'name': 'Bleach'}, {'anime_id': 642, 'name': 'Ichigo 100%: Koi ga Hajimaru?! Satsuei Gasshuku - Yureru Kokoro ga Higashi e Nishi e'}, {'anime_id': 762, 'name': 'Bleach: Memories in the Rain'}, {'anime_id': 834, 'name': 'Bleach: The Sealed Sword Frenzy'}, {'anime_id': 1519, 'name': 'Black Lagoon: The Second Barrage'}, {'anime_id': 1686, 'name': 'Bleach Movie 1: Memories of Nobody'}, {'anime_id': 2060, 'name': 'Gall Force 1: Eternal Story