In [1]:
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from surprise import Dataset, Reader
from surprise import SVD
from surprise import accuracy

In [2]:
personal = pd.read_parquet("final_recommendations_feat.parquet")
tops_rec = pd.read_parquet("top_recs.parquet")

In [3]:
personal.info

<bound method DataFrame.info of           user_id   item_id  als_score  cnt_score  cb_score  rank   age  \
20        1000007   9969571   0.650620        NaN  0.681197     1   7.0   
24        1000007  22557272   0.795854        NaN  0.653660     2   3.0   
10        1000007   9361589   0.637782        NaN  0.612345     3   7.0   
0         1000007  18143977   1.044727        NaN  0.607587     4   4.0   
103       1000007     91439        NaN   0.799560  0.588666     5   1.0   
...           ...       ...        ...        ...       ...   ...   ...   
14520145  1430468      6185   0.145202        NaN  0.090999    96  16.0   
14520167  1430468      9014   0.096063   0.951634  0.083508    97  19.0   
14520249  1430468     32419        NaN   0.891957  0.068369    98  13.0   
14520165  1430468    830502   0.132229        NaN  0.055526    99  31.0   
14520141  1430468     48855   0.117985        NaN  0.054740   100  25.0   

          average_rating  reading_years  books_read  ...   genre_1 

In [4]:
tops_rec.info

<bound method DataFrame.info of      item_id  users  avg_rating  popularity_weighted  \
0   22557272  40690    3.788965             154173.0   
1   29056083  25785    3.801784              98029.0   
2   18007564  20207    4.321275              87320.0   
3   18143977  19462    4.290669              83505.0   
4   16096824  16770    4.301014              72128.0   
..       ...    ...         ...                  ...   
95  15704307   5322    4.410936              23475.0   
96  22318578   6451    3.626104              23392.0   
97  17378508   5284    4.335541              22909.0   
98  23848559   5592    4.065629              22735.0   
99  18081809   6283    3.596689              22598.0   

                                     author  \
0                             Paula Hawkins   
1   John Tiffany, Jack Thorne, J.K. Rowling   
2                                 Andy Weir   
3                             Anthony Doerr   
4                             Sarah J. Maas   
..           

In [6]:
tops_rec.head(10)

Unnamed: 0,item_id,users,avg_rating,popularity_weighted,author,title,genre_and_votes,publication_year,score,rank
0,22557272,40690,3.788965,154173.0,Paula Hawkins,The Girl on the Train,"{'Fiction': 9793, 'Mystery': 9190, 'Thriller':...",2015.0,1.0,1
1,29056083,25785,3.801784,98029.0,"John Tiffany, Jack Thorne, J.K. Rowling",Harry Potter and the Cursed Child - Parts One ...,"{'Fantasy': 14466, 'Fiction': 4232, 'Young Adu...",2016.0,0.5,2
2,18007564,20207,4.321275,87320.0,Andy Weir,The Martian,"{'Science Fiction': 11966, 'Fiction': 8430}",2014.0,0.333333,3
3,18143977,19462,4.290669,83505.0,Anthony Doerr,All the Light We Cannot See,"{'Historical-Historical Fiction': 13679, 'Fict...",2014.0,0.25,4
4,16096824,16770,4.301014,72128.0,Sarah J. Maas,A Court of Thorns and Roses (A Court of Thorns...,"{'Fantasy': 14326, 'Young Adult': 4662, 'Roman...",2015.0,0.2,5
5,3,15139,4.706057,71245.0,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,"{'Fantasy': 59818, 'Fiction': 17918, 'Young Ad...",1997.0,0.166667,6
6,9460487,16564,3.809949,63108.0,Ransom Riggs,Miss Peregrine’s Home for Peculiar Children (M...,"{'Fantasy': 12454, 'Young Adult': 9293, 'Ficti...",,0.142857,7
7,38447,14611,4.23277,61845.0,Margaret Atwood,The Handmaid's Tale,"{'Fiction': 15424, 'Classics': 9937, 'Science ...",1998.0,0.125,8
8,15881,13043,4.632447,60421.0,"J.K. Rowling, Mary GrandPré",Harry Potter and the Chamber of Secrets (Harry...,"{'Fantasy': 50130, 'Young Adult': 15202, 'Fict...",1999.0,0.111111,9
9,11235712,14348,4.179189,59963.0,Marissa Meyer,"Cinder (The Lunar Chronicles, #1)","{'Young Adult': 10539, 'Fantasy': 9237, 'Scien...",2012.0,0.1,10


In [None]:
# Загрузка данных
items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")

# Точка разбиения для тренировочного и тестового наборов
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()
train_test_global_time_split_idx = events["started_at_month"] < train_test_global_time_split_date
events_train = events[train_test_global_time_split_idx]
events_test = events[~train_test_global_time_split_idx]

# Количество уникальных пользователей в train и test
users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()
common_users = pd.Index(users_train).intersection(pd.Index(users_test))
cold_users = pd.Index(users_test).difference(pd.Index(users_train))

# Удаление редких айтемов (с которыми взаимодействовало менее 2 пользователей)
item_activity = events_train.groupby('item_id')['user_id'].nunique().reset_index(name='user_count')
items_to_keep = item_activity[item_activity['user_count'] >= 2]['item_id']
events_train_filtered = events_train[events_train['item_id'].isin(items_to_keep)]

In [None]:
items_to_keep.head(10)

In [None]:
events_train_filtered.head(10)

In [None]:
# Получаем данные для рекомендаций
reader = Reader(rating_scale=(1, 5))
surprise_train_set = Dataset.load_from_df(events_train_filtered[['user_id', 'item_id', 'rating']], reader)
surprise_train_set = surprise_train_set.build_full_trainset()

In [None]:
# Инициализация и обучение модели SVD
svd_model = SVD(n_factors=100, random_state=0)
svd_model.fit(surprise_train_set)

In [None]:
# Получаем предсказания для тестовой выборки
surprise_test_set = list(events_test[['user_id', 'item_id', 'rating']].itertuples(index=False))
svd_predictions = svd_model.test(surprise_test_set)

In [None]:
# Оценка рекомендаций
rmse = accuracy.rmse(svd_predictions)
mae = accuracy.mae(svd_predictions)

In [None]:
# Выводим результаты
print(f"RMSE для SVD модели после удаления редких айтемов: {rmse}")
print(f"MAE для SVD модели после удаления редких айтемов: {mae}")

# Сравнение с предыдущими результатами
# Предыдущие RMSE и MAE (из вашего исходного кода)
previous_rmse = 1.25  # Замените на актуальное значение из вашего исходного кода
previous_mae = 0.99  # Замените на актуальное значение из вашего исходного кода

print(f"Изменение RMSE: {rmse - previous_rmse}")
print(f"Изменение MAE: {mae - previous_mae}")