In [14]:
import pandas as pd

from surprise import Dataset, Reader
from surprise import SVD, KNNBasic
from surprise import accuracy
from surprise.model_selection import train_test_split, cross_validate
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold

In [15]:
# Load the ratings file: '::'-separated, no header row. A multi-character
# separator is only handled by pandas' pure-Python parser, hence engine='python'.
ratings = pd.read_csv(
    'ratings.dat',
    sep='::',
    header=None,
    names=['userId', 'movieId', 'rating', 'timestamp'],
    engine='python',
)
# Surprise consumes (user, item, rating) triples; the scale is declared as 1..5.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Candidate Surprise algorithms to compare.
# SVD starts from randomly initialised factors, so its seed is pinned to keep
# reruns repeatable; KNNBasic exposes no such parameter.
algorithms = {
    'SVD': SVD(random_state=42),
    'KNNBasic': KNNBasic(),
}

In [16]:
# Score every candidate with 5-fold cross-validation and keep its mean test RMSE.
# One seeded KFold is shared by all candidates, so each algorithm is evaluated
# on exactly the same train/test partitions and the comparison is repeatable
# (a plain cv=5 would reshuffle the data differently on every call).
folds = KFold(n_splits=5, random_state=42, shuffle=True)
results = {}
for name, algo in algorithms.items():
    print(f"Оценка {name}...")
    cv_results = cross_validate(algo, data, measures=['RMSE'], cv=folds, verbose=True)
    results[name] = cv_results['test_rmse'].mean()
results

Оценка SVD...
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8753  0.8727  0.8747  0.8730  0.8732  0.8738  0.0010  
Fit time          5.95    6.38    6.54    6.32    6.31    6.30    0.19    
Test time         1.03    1.28    1.42    1.13    1.32    1.23    0.14    
Оценка KNNBasic...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9238  0.9250  0.9210  0.9238  0.9199  0.9227  0.0019  
Fit time          25.87   24.88   25.27   24.68  

{'SVD': 0.8737689464724353, 'KNNBasic': 0.9227035635551362}

In [18]:
# Pick the algorithm with the lowest mean RMSE; on a tie the entry that
# appears first in `results` wins.
best_algorithm_name, best_rmse = min(
    results.items(),
    key=lambda entry: entry[1],
)
print(f"Лучший алгоритм: {best_algorithm_name} с RMSE: {best_rmse}")

Лучший алгоритм: SVD с RMSE: 0.8737689464724353


In [21]:
# Tune the winning algorithm with an exhaustive grid search (5-fold CV, RMSE).
# Each entry maps an algorithm name to (algorithm class, parameter grid), so the
# search and the reporting are written once instead of once per algorithm.
tuning_setups = {
    'SVD': (SVD, {
        'n_factors': [50, 500],
        'n_epochs': [20, 30],
        'lr_all': [0.001, 0.01],
        'reg_all': [0.01, 0.1],
    }),
    'KNNBasic': (KNNBasic, {
        'k': [5, 10, 15],
        'sim_options': {
            'name': ['msd', 'cosine'],
            'user_based': [True, False],  # compare users (True) or items (False)
        },
    }),
}

if best_algorithm_name in tuning_setups:
    algo_class, param_grid = tuning_setups[best_algorithm_name]
    gs = GridSearchCV(algo_class, param_grid, measures=['rmse'], cv=5)
    gs.fit(data)
    print(f"Лучшие параметры для {best_algorithm_name}: {gs.best_params['rmse']}")
    print(f"RMSE для {best_algorithm_name}: {gs.best_score['rmse']}")
else:
    # No grid is defined for this algorithm: say so instead of skipping silently.
    print(f"Для алгоритма {best_algorithm_name} сетка параметров не задана, тюнинг пропущен")

Лучшие параметры для SVD: {'n_factors': 500, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}
RMSE для SVD: 0.8699951875043348
