In [72]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from src.mapk import *
from sklearn.model_selection import train_test_split

#Data Preprocessing

In [73]:
# Файлы
INPUT_DIR = 'C:/Dataset'

In [74]:
# Define the function to process each data chunk
def process_data_chunk(chunk):
    # Placeholder function, modify it with your actual processing logic
    processed_data = chunk + 1
    return processed_data

In [75]:
# Чтение файлов с указанием типов данных и оптимизация чтения
anime_ratings_chunks = pd.read_csv(
    INPUT_DIR + '/rating_complete.csv',
    usecols=["user_id", "anime_id", "rating"],
    dtype={"user_id": "int32", "anime_id": "int32", "rating": "int16"},
    engine='c',
    low_memory=True,
    chunksize=10000  # Размер блока (порции) для чтения
)

anime_data = pd.read_csv(
    INPUT_DIR + '/anime.csv',
    engine='c',
    low_memory=True
)
# Concatenate the chunks into a single DataFrame
anime_ratings = pd.concat(anime_ratings_chunks)

# Process each chunk of data
for chunk in anime_ratings_chunks:
    processed_chunk = process_data_chunk(chunk)
    # Do further operations with the processed chunk if needed
    print(processed_chunk.head())  # Example: Print the processed chunk's head

In [76]:
# Содержимое anime.csv
anime_data.head(3)

Unnamed: 0,anime_id,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170,182126,131625,62330,20688,8904,3184,1357,741,1580
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,1-Sep-01,Unknown,...,30043,49201,49505,22632,5805,1877,577,221,109,379
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229,75651,86142,49432,15376,5838,1965,664,316,533


In [77]:
# (60% train, 40% test)
train_ratings, test_ratings = train_test_split(anime_ratings, test_size=0.4, random_state=42)

# (50% test, 50% validation)
test_ratings, val_ratings = train_test_split(test_ratings, test_size=0.5, random_state=42)

In [78]:
# anime_ratings
len(anime_ratings)


57633278

In [79]:
# train_ratings
len(train_ratings)

34579966

In [80]:
# val_ratings
len(val_ratings)

11526656

In [81]:
# test_ratings
len(test_ratings)

11526656

In [82]:
# Cодержимое anime_ratings.csv
anime_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7


In [83]:
# Пользователь должен оценить минимум 500 аниме (train_ratings)
ntrain_ratings = train_ratings['user_id'].value_counts()
train_ratings = train_ratings[train_ratings['user_id'].isin(ntrain_ratings[ntrain_ratings >= 500].index)].copy()
len(train_ratings)

5505758

In [84]:
# Пользователь должен оценить минимум 500 аниме (test_ratings)
ntest_ratings = test_ratings['user_id'].value_counts()
test_ratings = test_ratings[test_ratings['user_id'].isin(ntest_ratings[ntest_ratings >= 500].index)].copy()
len(test_ratings)

209255

In [85]:
# Пользователь должен оценить минимум 500 аниме (val_ratings)
nval_ratings = val_ratings['user_id'].value_counts()
val_ratings = val_ratings[val_ratings['user_id'].isin(nval_ratings[nval_ratings >= 500].index)].copy()
len(val_ratings)

207649

In [86]:
# Удаление Duplicated Rows
train_ratings = train_ratings.drop_duplicates()
test_ratings = test_ratings.drop_duplicates()
val_ratings = val_ratings.drop_duplicates()

In [87]:
train_ratings.head(3)

Unnamed: 0,user_id,anime_id,rating
36686360,225127,9587,2
24727501,152175,17873,7
25338517,156126,11785,8


In [88]:
# Создание сводной таблицы (pivot table). 
# По горизонтали будут аниме, по вертикали - пользователи, значения - оценки
user_item_matrix_train = train_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')
user_item_matrix_test = test_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')
user_item_matrix_val = val_ratings.pivot(index = 'anime_id', columns = 'user_id', values= 'rating')

user_item_matrix_train.head()

user_id,314,326,371,446,478,601,603,694,730,781,...,352887,352922,352924,352930,353012,353134,353304,353311,353325,353326
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,9.0,,,,,,,7.0,...,,,,10.0,,,,,,
5,,10.0,9.0,,,10.0,,8.0,,,...,,,9.0,7.0,,,,,,
6,,8.0,,7.0,,9.0,,,,10.0,...,,,,10.0,,,7.0,,,
7,,8.0,,,5.0,,,,,,...,,,,,,,,,,
8,,,,,,,,7.0,,,...,,,,,,,,,,


In [89]:
# NaN преобразовываю в нули
user_item_matrix_train.fillna(0, inplace = True)
user_item_matrix_test.fillna(0, inplace = True)
user_item_matrix_val.fillna(0, inplace = True)
user_item_matrix_train.head()

user_id,314,326,371,446,478,601,603,694,730,781,...,352887,352922,352924,352930,353012,353134,353304,353311,353325,353326
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,...,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,10.0,9.0,0.0,0.0,10.0,0.0,8.0,0.0,0.0,...,0.0,0.0,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,8.0,0.0,7.0,0.0,9.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,10.0,0.0,0.0,7.0,0.0,0.0,0.0
7,0.0,8.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
# Размерность матрицы "пользователи х anime"
user_item_matrix_train.shape

(16777, 7161)

In [91]:
# Преобразую разреженную матрицу в формат csr
# Метод values передаст функции csr_matrix только значения датафрейма
csr_data_train = csr_matrix(user_item_matrix_train.values)
csr_data_test = csr_matrix(user_item_matrix_test.values)
csr_data_tval = csr_matrix(user_item_matrix_val.values)


In [92]:
user_item_matrix_train.head()

user_id,314,326,371,446,478,601,603,694,730,781,...,352887,352922,352924,352930,353012,353134,353304,353311,353325,353326
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,...,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,10.0,9.0,0.0,0.0,10.0,0.0,8.0,0.0,0.0,...,0.0,0.0,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,8.0,0.0,7.0,0.0,9.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,10.0,0.0,0.0,7.0,0.0,0.0,0.0
7,0.0,8.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
# Сброшу индекс с помощью reset_index()
user_item_matrix_train = user_item_matrix_train.rename_axis(None, axis = 1).reset_index()
user_item_matrix_test = user_item_matrix_test.rename_axis(None, axis = 1).reset_index()
user_item_matrix_val = user_item_matrix_val.rename_axis(None, axis = 1).reset_index()

user_item_matrix_train.head()

Unnamed: 0,anime_id,314,326,371,446,478,601,603,694,730,...,352887,352922,352924,352930,353012,353134,353304,353311,353325,353326
0,1,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0.0,10.0,9.0,0.0,0.0,10.0,0.0,8.0,0.0,...,0.0,0.0,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,0.0,8.0,0.0,7.0,0.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,10.0,0.0,0.0,7.0,0.0,0.0,0.0
3,7,0.0,8.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#MAPK TOP 10 Recommendations

In [94]:
train_ratings.info

<bound method DataFrame.info of           user_id  anime_id  rating
36686360   225127      9587       2
24727501   152175     17873       7
25338517   156126     11785       8
34609599   212486     38098       7
21459498   131803     13655       9
...           ...       ...     ...
52286002   320669     33994       5
16861870   103523      8917       5
46792155   286933     16592       9
48140618   295130     30187       7
26735830   164525      6974       6

[5505758 rows x 3 columns]>

In [95]:
# Создание датасета с топ 10 рекомендациями для каждого пользователя
user_recommendations = {}

for user_id in test_ratings['user_id'].unique():
    user_ratings = train_ratings[train_ratings['user_id'] == user_id]
    anime_not_rated = list(set(train_ratings['anime_id'].unique()) - set(user_ratings['anime_id'].unique()))
    recommended_anime = anime_not_rated[:10] # использую топ-10 рекомендаций
    user_recommendations[user_id] = recommended_anime


In [96]:
# Создание списка рекомендованных и реальных оценок для каждого пользователя
recommended_ratings = []
actual_ratings_list = []

for user_id in user_recommendations:
    recommended_ratings.append(user_recommendations[user_id])
    actual_ratings_list.append(list(test_ratings[test_ratings['user_id'] == user_id]['anime_id']))


In [97]:
# Расчет MAPK@10
mapk_test_before_score = mapk(actual_ratings_list, recommended_ratings, k=10)
print("MAPK@10 score: ", mapk_test_before_score)

MAPK@10 score:  0.09671826850398278


#Item Based Collaborative Filtering Recommendation System 

#Top 10 anime recommendations based on user preferences

In [98]:
# Функции, которая возвращает результаты по каждому поисковому аниме отдельно, 
# а также объединенный набор данных с 10 лучшими рекомендациями по всем поисковым аниме на основе наименьшего расстояния:
def get_recommendations(search_words, recommendations=10):
    result = {}
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
    knn.fit(csr_data_train)
    all_recoms = []
    for word in search_words:
        anime_search = anime_data[anime_data['Name'].str.contains(word)]
        anime_id = anime_search.iloc[0]['anime_id']
        anime_id = user_item_matrix_train[user_item_matrix_train['anime_id'] == anime_id].index[0]
        distances, indices = knn.kneighbors(csr_data_train[anime_id], n_neighbors=recommendations + 1)
        indices_list = indices.squeeze().tolist()
        distances_list = distances.squeeze().tolist()
        indices_distances = list(zip(indices_list, distances_list))
        indices_distances_sorted = sorted(indices_distances, key=lambda x: x[1], reverse=False)
        indices_distances_sorted = indices_distances_sorted[1:]
        recom_list = []
        for ind_dist in indices_distances_sorted:
            matrix_movie_id = user_item_matrix_train.iloc[ind_dist[0]]['anime_id']
            id = anime_data[anime_data['anime_id'] == matrix_movie_id].index
            Name = anime_data.iloc[id]['Name'].values[0]
            dist = ind_dist[1]
            recom_list.append({'Name': Name, 'Distance': dist})
        recom_df = pd.DataFrame(recom_list, index=range(1, recommendations + 1))
        recom_df.drop(recom_df[recom_df['Name'].isin(search_words)].index, inplace=True)
        result[word] = recom_df
        all_recoms.extend(recom_list)
    all_recoms_df = pd.DataFrame(all_recoms)
    all_recoms_df.drop(all_recoms_df[all_recoms_df['Name'].isin(search_words)].index, inplace=True)
    all_recoms_df = all_recoms_df.sort_values(by=['Distance'], ascending=True).head(recommendations)
    return result, all_recoms_df


In [99]:
search_words = ['Bleach', 'Naruto']
recommendations = 10
results, top_all_recoms = get_recommendations(search_words, recommendations)


In [100]:
print(results['Bleach'])

                                                 Name  Distance
1                          Bleach Movie 4: Jigoku-hen  0.524367
2                                          Fairy Tail  0.530206
4   Bleach Movie 3: Fade to Black - Kimi no Na wo ...  0.534748
5   Bleach Movie 2: The DiamondDust Rebellion - Mo...  0.538845
6                  Bleach Movie 1: Memories of Nobody  0.539614
7                                      Ao no Exorcist  0.544533
8                                          Death Note  0.548391
9                    Fullmetal Alchemist: Brotherhood  0.557603
10                                 Shingeki no Kyojin  0.558816


In [101]:
print(results['Naruto'])

                                Name  Distance
1                         Death Note  0.504439
2                 Shingeki no Kyojin  0.515048
3                     Ao no Exorcist  0.516811
4                   Sword Art Online  0.518397
5   Fullmetal Alchemist: Brotherhood  0.520028
6                      One Punch Man  0.525688
7    Code Geass: Hangyaku no Lelouch  0.528066
8                          Toradora!  0.530730
9                 Naruto: Shippuuden  0.530904
10                       Tokyo Ghoul  0.531359


In [102]:
# Объединенный набор с 10 лучшими рекомендациями по всем поисковым аниме, не учитывающий выбранные аниме
print(top_all_recoms)

                                Name  Distance
10                        Death Note  0.504439
11                Shingeki no Kyojin  0.515048
12                    Ao no Exorcist  0.516811
13                  Sword Art Online  0.518397
14  Fullmetal Alchemist: Brotherhood  0.520028
0         Bleach Movie 4: Jigoku-hen  0.524367
15                     One Punch Man  0.525688
16   Code Geass: Hangyaku no Lelouch  0.528066
1                         Fairy Tail  0.530206
17                         Toradora!  0.530730


#MAPK TOP 10 recommendations using the K-Nearest Neighbors algorithm (KNN)

In [103]:
# Создание датасета с топ 10 рекомендациями для каждого пользователя
user_recommendations = {}

for user_id in test_ratings['user_id'].unique():
    user_ratings = train_ratings[train_ratings['user_id'] == user_id]
    anime_not_rated = list(set(train_ratings['anime_id'].unique()) - set(user_ratings['anime_id'].unique()))
    recommended_anime = []
    for word in search_words:
        recommended_anime.extend(results[word].index.tolist())
    recommended_anime = list(set(recommended_anime))
    user_recommendations[user_id] = recommended_anime[:10] # использую топ-10 рекомендаций
# Создание списка рекомендованных и реальных оценок для каждого пользователя
recommended_ratings = []
actual_ratings_list = []

for user_id in user_recommendations:
    recommended_ratings.append(user_recommendations[user_id])
    actual_ratings_list.append(list(test_ratings[test_ratings['user_id'] == user_id]['anime_id']))
# Расчет MAPK@10
mapk_test_after_score = mapk(actual_ratings_list, recommended_ratings, k=10)
print("MAPK@10 score: ", mapk_test_after_score)


MAPK@10 score:  0.027172510029652885


#Comparison of MAPK

In [104]:
percent_increase = ((mapk_test_before_score - mapk_test_after_score) / mapk_test_after_score) * 100 if mapk_test_after_score != 0 else 100
print(f"mapk (топ 10 найденных с помощью KNN) на {percent_increase:.2f}% больше mapk (топ 10 наиболее популярных)")

mapk (топ 10 найденных с помощью KNN) на 255.94% больше mapk (топ 10 наиболее популярных)


-------