In [1]:
import pandas as pd
import numpy as np
from implicit.evaluation import train_test_split
from surprise.model_selection import train_test_split
from tqdm import tqdm
from src.mapk import *
from sklearn.model_selection import train_test_split




#Data Preprocessing

In [2]:
# Файлы
INPUT_DIR = 'C:/Dataset'

In [3]:
# Чтение файлов
anime_data = pd.read_csv(INPUT_DIR + '/movieData.csv',
                        low_memory=False,
                        decimal=',',
                        usecols=["movie_id", "Name"]
                        )
anime_ratings = pd.read_csv(INPUT_DIR + '/movieRatings.csv',
                        low_memory=False,
                        decimal=',',
                        usecols=["user_id", "movie_id","rating"]
                        ).astype({'rating': 'float32'})

In [4]:
# (80% train, 20% test)
train_ratings, test_ratings = train_test_split(anime_ratings, test_size=0.2, random_state=42)

# (60% train, 20% validation)
train_ratings, val_ratings = train_test_split(train_ratings, test_size=0.2, random_state=42)

# (60% train, 20% leftovers)
train_ratings, leftovers_ratings = train_test_split(train_ratings, test_size=0.2, random_state=42)

print(f"Размер train_ratings: {len(train_ratings)}")
print(f"Размер val_ratings: {len(val_ratings)}")
print(f"Размер test_ratings: {len(test_ratings)}")
print(f"Размер leftovers: {len(leftovers_ratings)}")

Размер train_ratings: 51627
Размер val_ratings: 16134
Размер test_ratings: 20168
Размер leftovers: 12907


In [5]:
# Заполнение пропущенных значений
train_ratings.fillna(0, inplace=True)
val_ratings.fillna(0, inplace=True)
test_ratings.fillna(0, inplace=True)

In [6]:
# Кодирование категориальных признаков
user_ids = test_ratings.user_id.drop_duplicates().values
anime_ids = test_ratings.movie_id.drop_duplicates().values

test_ratings['user_id'] = test_ratings['user_id'].astype('category').cat.codes
test_ratings['movie_id'] = test_ratings['movie_id'].astype('category').cat.codes


In [7]:
n_ratings = test_ratings['user_id'].value_counts()
test_ratings = test_ratings[test_ratings['user_id'].isin(n_ratings[n_ratings >= 5].index)].copy()
len(test_ratings)

19993

Test Dataset

In [8]:
# Удаление дубликатов
test_ratings.drop_duplicates(subset=['user_id', 'movie_id'], inplace=True)

In [9]:
# Разложение матрицы
k = 50
max_epochs = 50
gamma = 0.01
lamda = 0.1

P = np.random.rand(len(user_ids), k)
Q = np.random.rand(len(anime_ids), k)

for epoch in range(max_epochs):
    for i, j, r in zip(test_ratings['user_id'], test_ratings['movie_id'], test_ratings['rating']):
        error = r - np.dot(P[i, :], Q[j, :].T)
        P[i, :] += gamma * (error * Q[j, :] - lamda * P[i, :])
        Q[j, :] += gamma * (error * P[i, :] - lamda * Q[j, :])
    
    # Получаем предсказанные оценки для неоцененных аниме
    predicted_ratings = np.dot(P, Q.T)
    predicted_ratings

In [10]:
# Получаем топ 10 рекомендаций для каждого пользователя
top_n = 10
recommendations = {}
for i in tqdm(range(len(user_ids))):
    top_anime_idx = np.argsort(predicted_ratings[i, :])[::-1][:top_n]
    top_anime_ids = [anime_ids[anime_idx] for anime_idx in top_anime_idx]
    recommendations[user_ids[i]] = top_anime_ids

100%|██████████| 610/610 [00:00<00:00, 3092.44it/s]


In [11]:
# Получаем фактические оценки аниме для каждого пользователя
actual_ratings = {}
for i in range(len(user_ids)):
    user_df = test_ratings[test_ratings['user_id'] == i]
    rated_anime = list(user_df['movie_id'].values)
    rating_scores = list(user_df['rating'].values)
    actual_ratings[user_ids[i]] = dict(zip(rated_anime, rating_scores))


In [12]:
def mapk(actual, predicted, k=10):
    """
    Mean Average Precision at K
    """
    mapk_scores = []
    for user in actual:
        actual_anime = list(actual[user].keys())
        predicted_anime = predicted[user]
        predicted_anime = [anime for anime in predicted_anime if anime in actual_anime]
        
        if len(predicted_anime) == 0:
            continue
        
        predicted_anime = predicted_anime[:k]
        score = 0.0
        num_hits = 0.0
        for i, p in enumerate(predicted_anime):
            if p in actual_anime:
                num_hits += 1
                score += num_hits / (i+1)
        if num_hits > 0:
            score /= min(len(actual_anime), k)
            mapk_scores.append(score)
    return np.mean(mapk_scores)

In [13]:
mapk_score = mapk(actual_ratings, recommendations, k=10)
print(f"Mean MAPK@10 score: {mapk_score:.4f}")

Mean MAPK@10 score: 0.1071


-------------