In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from src.mapk import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from pymongo import MongoClient

In [2]:
client = MongoClient("mongodb://root:password@localhost:27017/")

db = client["anime"]
collection = db["animelist"]

In [3]:
# Файлы
INPUT_DIR = 'C:/Dataset'

In [4]:
# Чтение файлов
anime_ratings = pd.read_csv(INPUT_DIR + '/rating_complete.csv',
                        low_memory=False,
                        decimal=',',
                        usecols=["user_id","anime_id","rating"]
                        )

In [5]:
# (60% train, 40% test)
anime_ratings, train_ratings = train_test_split(anime_ratings, test_size=0.6, random_state=42)

# (50% train, 50% test)
train_ratings, test_ratings = train_test_split(train_ratings, test_size=0.5, random_state=42)

In [6]:
# Пользователь должен оценить минимум 500 аниме (train_ratings)
ntrain_ratings = train_ratings['user_id'].value_counts()
train_ratings = train_ratings[train_ratings['user_id'].isin(ntrain_ratings[ntrain_ratings >= 500].index)].copy()
len(train_ratings)

749513

In [7]:
# Пользователь должен оценить минимум 500 аниме (test_ratings)
ntest_ratings = test_ratings['user_id'].value_counts()
test_ratings = test_ratings[test_ratings['user_id'].isin(ntest_ratings[ntest_ratings >= 500].index)].copy()
len(test_ratings)

747721

In [8]:
# Удаление Duplicated Rows
train_ratings = train_ratings.drop_duplicates()
test_ratings = test_ratings.drop_duplicates()

In [9]:
# Создание сводной таблицы (pivot table). 
# По горизонтали будут аниме, по вертикали - пользователи, значения - оценки
user_item_matrix_train = train_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')
user_item_matrix_train.head(3)

user_id,781,890,1177,1397,1469,1946,3578,4773,5045,5648,...,350215,350286,351119,351361,351696,351801,352301,352761,352922,352930
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,5.0,,8.0,,,,...,,9.0,,,,,,,,
5,,,8.0,,,,,,,8.0,...,,,,8.0,,,9.0,,,
6,,,,,,,,8.0,,,...,,,,5.0,,,,,,


In [10]:
# NaN преобразовываю в нули
user_item_matrix_train.fillna(0, inplace = True)

In [11]:
# Преобразую разреженную матрицу в формат csr
# Метод values передаст функции csr_matrix только значения датафрейма
csr_data_train = csr_matrix(user_item_matrix_train.values)

In [12]:
# Сброшу индекс с помощью reset_index()
user_item_matrix_train = user_item_matrix_train.rename_axis(None, axis = 1).reset_index()

# Load Data from DB

In [13]:
# Импорт модуля functools для использования декоратора lru_cache
from functools import lru_cache

# Получение данных об аниме с кэшированием результатов
@lru_cache(maxsize=None)
def load_anime_data():
    anime_data = []
    for document in collection.find():
        anime_id = document.get('anime_id')
        title = document.get('title')
        title_japanese = document.get('title_japanese')
        cover = document.get('cover')
        anime_type = document.get('type')
        episodes = document.get('episodes')
        airing = document.get('airing')
        aired_from = document.get('aired_from')
        aired_to = document.get('aired_to')
        duration = document.get('duration')
        synopsis = document.get('synopsis')
        producers = document.get('producers')
        studios = document.get('studios')
        genres = document.get('genres')

        anime_data.append({
            'anime_id': anime_id,
            'title': title,
            'title_japanese': title_japanese,
            'cover': cover,
            'type': anime_type,
            'episodes': episodes,
            'airing': airing,
            'aired_from': aired_from,
            'aired_to': aired_to,
            'duration': duration,
            'synopsis': synopsis,
            'producers': producers,
            'studios': studios,
            'genres': genres
        })
    return anime_data

# Item Based

In [14]:
def get_item_based_recommendations(search_words, n_recommendations=10):
    anime_data = load_anime_data()  # Загрузка данных

    recommendations = []
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
    knn.fit(csr_data_train)

    for word in search_words:
        # Фильтрация аниме по заданному слову в заголовке
        anime_search = [anime for anime in anime_data if word in anime['title']]
        if not anime_search:
            continue
        anime_id = anime_search[0]['anime_id']

        # Преобразование anime_id в индекс матрицы
        anime_id = user_item_matrix_train[user_item_matrix_train['anime_id'] == anime_id].index[0]

        # Поиск ближайших соседей и расстояний до них
        distances, indices = knn.kneighbors(csr_data_train[anime_id], n_neighbors=n_recommendations + 1)
        indices_list = indices.squeeze().tolist()[1:]
        distances_list = distances.squeeze().tolist()[1:]
        indices_distances = list(zip(indices_list, distances_list))

        # Получение рекомендаций и добавление их в список
        for ind_dist in indices_distances:
            anime_id = int(user_item_matrix_train.iloc[ind_dist[0]]['anime_id'])
            anime = next((anime for anime in anime_data if anime['anime_id'] == anime_id), None)
            if anime:
                recommendations.append(anime)

    return {'recommendations': recommendations[:n_recommendations]}

In [15]:
print(get_item_based_recommendations(['Naruto', 'Bleach'], 3))

{'recommendations': [{'anime_id': 4437, 'title': 'Naruto: Shippuuden Movie 2 - Kizuna', 'title_japanese': 'åŠ‡å\xa0´ç‰ˆNARUTO-ãƒŠãƒ«ãƒˆ- ç–¾é¢¨ä¼\x9d çµ†', 'cover': 'https://cdn.myanimelist.net/images/anime/1484/134494.jpg', 'type': 'Movie', 'episodes': 1.0, 'airing': 0.0, 'aired_from': '2008-08-02T00:00:00+00:00', 'aired_to': None, 'duration': '1 hr 32 min', 'synopsis': "Unleashing a devastating surprise attack, flying ninjas from the Land of Sky are seeking revenge against their old enemy Konohagakure. Despite his eagerness to join the fight, Naruto Uzumaki is held up by Shinnou, a mysterious doctor who requires his assistance to save an injured person. While delivering the wounded man to the hospital, Naruto has an unexpected encounter with Amaruâ€”a stormy youngster from a neighboring village desperately looking for Shinnou's help.\\n\\nMeanwhile, as the invaders withdraw to restore their forces, Tsunade seizes the opportunity to dispatch a small team including Sai, Shikamaru Nara,

# User Based

In [16]:
# Создание матрицы пользователь-аниме
user_anime_matrix = csr_matrix((train_ratings['rating'],
                                (train_ratings['user_id'], train_ratings['anime_id'])))

# Создание модели NearestNeighbors
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(user_anime_matrix)

In [17]:
def get_user_based_recommendations(user_id, n_recommendations=10):
    # Загрузка данных об аниме
    anime_data = load_anime_data()

    # Получение оценок выбранного пользователя
    user_rated_anime = train_ratings[train_ratings['user_id'] == user_id]['anime_id'].unique()

    # Нахождение индексов наиболее похожих пользователей
    similar_users = model.kneighbors(user_anime_matrix[user_id], n_neighbors=n_recommendations)[1].flatten()

    # Получение списка аниме, оцененных найденными похожими пользователями
    similar_anime = train_ratings[train_ratings['user_id'].isin(similar_users)]['anime_id'].unique()

    # Исключение аниме, которые уже оценил выбранный пользователь
    recommended_anime = [anime_id for anime_id in similar_anime if anime_id not in user_rated_anime]

    # Получение данных о рекомендуемом аниме
    recommended_anime_data = [anime for anime in anime_data if anime['anime_id'] in recommended_anime]

    # Список рекомендуемого аниме
    return {'recommendations': recommended_anime_data[:n_recommendations]}


In [18]:
train_ratings.head(2)

Unnamed: 0,user_id,anime_id,rating
50579508,310065,32900,7
1733703,10851,19023,5


In [19]:
# Получение рекомендаций для пользователя
print(get_user_based_recommendations(310065, 4))

{'recommendations': [{'anime_id': 1, 'title': 'Cowboy Bebop', 'title_japanese': 'ã‚«ã‚¦ãƒœãƒ¼ã‚¤ãƒ“ãƒ\x90ãƒƒãƒ—', 'cover': 'https://cdn.myanimelist.net/images/anime/4/19644.jpg', 'type': 'TV', 'episodes': 26.0, 'airing': 0.0, 'aired_from': '1998-04-03T00:00:00+00:00', 'aired_to': '1999-04-24T00:00:00+00:00', 'duration': '24 min per ep', 'synopsis': "Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth. These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.\\n\\nSpike Spiegel and Jet Black pursue criminals throughout space to make a humble living. Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past. Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship. The duo is joined by the beautiful con artist Faye Valentine, od

# Content Based

In [20]:
def get_content_recommendations(search_words, n_recommendations=10):
    anime_data = load_anime_data()  # Загружаем данные 

    # Создание матрицы признаков на основе synopsis (content-based)
    content_matrix = pd.DataFrame(anime_data)  # Создаем DataFrame из данных аниме
    content_matrix['synopsis'] = content_matrix['synopsis'].fillna('')  # Заполняем пропущенные значения в столбце "synopsis" пустой строкой

    tfidf = TfidfVectorizer(stop_words='english')  # Создаем объект TfidfVectorizer для создания матрицы TF-IDF
    tfidf_matrix = tfidf.fit_transform(content_matrix['synopsis'].values.astype('U'))  # Преобразуем synopsis в TF-IDF матрицу признаков

    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_recommendations+1, n_jobs=-1)  # Инициализируем модель NearestNeighbors для поиска ближайших соседей
    knn.fit(tfidf_matrix)  # Обучаем модель на матрице признаков

    recommendations = []

    for word in search_words:
        anime_search = content_matrix[content_matrix['title'].str.contains(word, case=False)]  # Ищем аниме, в названии которого есть заданное слово (без учета регистра)

        if anime_search.empty:
            continue

        anime_ids = anime_search['anime_id'].values
        anime_recommendations = []

        for anime_id in anime_ids:
            anime_index = content_matrix[content_matrix['anime_id'] == anime_id].index[0]
            distances, indices = knn.kneighbors(tfidf_matrix[anime_index], n_neighbors=n_recommendations + 1)
            indices_list = indices.squeeze()[1:].tolist()  # Исключаем первый элемент, который является самим аниме
            anime_recommendations.extend(indices_list)

        anime_recommendations = list(set(anime_recommendations))[:n_recommendations]  # Извлекаем n уникальных рекомендаций

        for anime_index in anime_recommendations:
            anime_info = content_matrix.loc[anime_index].to_dict()
            if anime_info['anime_id'] not in anime_ids:
                recommendations.append(anime_info)

    return {'recommendations': recommendations[:n_recommendations]}


In [21]:
print(get_content_recommendations(['Bleach', 'Naruto'], 4))

{'recommendations': [{'anime_id': 553, 'title': 'Yami no Matsuei', 'title_japanese': 'é—‡ã\x81®æœ«è£”', 'cover': 'https://cdn.myanimelist.net/images/anime/1414/109717.jpg', 'type': 'TV', 'episodes': 13.0, 'airing': 0.0, 'aired_from': '2000-10-02T00:00:00+00:00', 'aired_to': '2000-12-18T00:00:00+00:00', 'duration': '23 min per ep', 'synopsis': "Even after death, life is full of paperwork and criminals. Tsuzuki Asato is a 26 year old, happy-go-lucky, and dorky shinigami (god of death) whose job is to makes sure that those who are dead remain dead and stay in their proper realms. Even though he's had this job for over 70 years, he is in the worst division with horrible pay. He also has a knack for not keeping partners (since shinigami work in pairs), but now he seems to have one that will stick around", 'producers': None, 'studios': None, 'genres': None}, {'anime_id': 3560, 'title': 'Karen', 'title_japanese': 'ã‚«ãƒ¬ãƒ³', 'cover': 'https://cdn.myanimelist.net/images/anime/1418/112580.jpg'

# Hybrid

In [22]:
def hybrid_recommendations(search_words, n_recommendations, user_id, recommendations_count):
    # Получение рекомендаций с использованием content-based метода
    content_based = get_content_recommendations(search_words, recommendations_count)
    
    # Получение рекомендаций с использованием user-based метода
    user_based = get_user_based_recommendations(user_id, recommendations_count)
    
    # Получение рекомендаций с использованием item-based метода
    item_based = get_item_based_recommendations(search_words, recommendations_count)
    
    # Объединение всех рекомендаций в один список
    all_recommendations = content_based['recommendations'] + user_based['recommendations'] + item_based['recommendations']
    
    # Удаление дубликатов
    unique_recommendations = list({rec['anime_id']: rec for rec in all_recommendations}.values())
    
    # Сортировка по id
    unique_recommendations.sort(key=lambda x: x['anime_id'])
    
    return {'recommendations': unique_recommendations[:recommendations_count]}


In [23]:
print(hybrid_recommendations(['Naruto', 'Bleach'], 10, 310065, 4 ))

{'recommendations': [{'anime_id': 1, 'title': 'Cowboy Bebop', 'title_japanese': 'ã‚«ã‚¦ãƒœãƒ¼ã‚¤ãƒ“ãƒ\x90ãƒƒãƒ—', 'cover': 'https://cdn.myanimelist.net/images/anime/4/19644.jpg', 'type': 'TV', 'episodes': 26.0, 'airing': 0.0, 'aired_from': '1998-04-03T00:00:00+00:00', 'aired_to': '1999-04-24T00:00:00+00:00', 'duration': '24 min per ep', 'synopsis': "Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth. These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.\\n\\nSpike Spiegel and Jet Black pursue criminals throughout space to make a humble living. Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past. Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship. The duo is joined by the beautiful con artist Faye Valentine, od