In [1]:
from surprise import Dataset
from surprise.accuracy import rmse
from surprise import NormalPredictor, BaselineOnly, KNNWithMeans, KNNBasic, KNNWithZScore
from surprise import KNNBaseline, SVD, SVDpp, NMF, SlopeOne, CoClustering
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import numpy as np
from tqdm import tqdm


In [2]:
# Загружаем датасет
data_100K = Dataset.load_builtin()

Создадим список алгоритмов и параметров для них. 

Потом прогоним все это через GridSearchCV


In [3]:
algoritms = [
    NormalPredictor, BaselineOnly, KNNWithMeans, KNNBasic, KNNWithZScore,  KNNBaseline, SVD, SVDpp, NMF, SlopeOne, CoClustering
]

In [4]:
res = []
for alg in algoritms:
    t = cross_validate(alg(), data_100K, measures=["rmse"], cv=5, verbose=True)
    res.append([alg, np.mean(t['test_rmse'])])

Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5174  1.5295  1.5160  1.5206  1.5160  1.5199  0.0051  
Fit time          0.03    0.03    0.03    0.03    0.03    0.03    0.00    
Test time         0.03    0.06    0.05    0.02    0.02    0.04    0.01    
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9473  0.9436  0.9316  0.9524  0.9453  0.9440  0.0069  
Fit time          0.04    0.06    0.06    0.05    0.06    0.05    0.01    
Test time         0.05    0.05    0.02    0.05    0.02    0.04    0.01    
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing

In [5]:
res

[[surprise.prediction_algorithms.random_pred.NormalPredictor,
  1.5199105125384385],
 [surprise.prediction_algorithms.baseline_only.BaselineOnly,
  0.9440175624049573],
 [surprise.prediction_algorithms.knns.KNNWithMeans, 0.9505668043252349],
 [surprise.prediction_algorithms.knns.KNNBasic, 0.9781620396733312],
 [surprise.prediction_algorithms.knns.KNNWithZScore, 0.9508921416105706],
 [surprise.prediction_algorithms.knns.KNNBaseline, 0.9304723086859692],
 [surprise.prediction_algorithms.matrix_factorization.SVD, 0.9361900383094957],
 [surprise.prediction_algorithms.matrix_factorization.SVDpp,
  0.9188021245728724],
 [surprise.prediction_algorithms.matrix_factorization.NMF, 0.964119263834051],
 [surprise.prediction_algorithms.slope_one.SlopeOne, 0.9447882861965047],
 [surprise.prediction_algorithms.co_clustering.CoClustering,
  0.9646857623131571]]

Наиболее перспективным выглядит SVDpp.

Будем его исследовать

In [47]:
param_grid = {
    'n_factors': [10],
    'n_epochs': [20],
    'lr_all': [0.003, 0.007, 0.01, 0.013],
    'reg_all': [0.01, 0.02, 0.03, 0.04 ]
}

In [48]:
gs = GridSearchCV(SVDpp, param_grid, measures=["rmse"], cv=3, joblib_verbose=3)
gs.fit(data_100K)




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  3.8min finished


In [50]:
gs.best_score

{'rmse': 0.924399843741685}

In [51]:
gs.best_params

{'rmse': {'n_factors': 10, 'n_epochs': 20, 'lr_all': 0.007, 'reg_all': 0.03}}

Тут я потерпел неудачу. Код крутился более 10 часов, но так ничего и не выдал. 

Возвращаемся к KNNBaseline, как второй по перспективности

In [52]:
param_grid = {

    'k': [29, 40, 60, 80, 100],
    #'min_k': [10, 20, 30],
    'sim_options': {
        'name': ['pearson', 'cosine','msd'],
#        'min_support': [1, 3, 5, 8, 13],
        'user_based': [False, True],
    },
}

In [53]:
gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse"], cv=3, joblib_verbose=3)
gs.fit(data_100K)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Computing the pearson similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


Computing the pearson similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.6s remaining:    0.0s


Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  2.8min finished


In [54]:
gs.best_score

{'rmse': 0.9417770157444648}

In [25]:
gs.best_score

{'rmse': 0.9355435518329102}

In [21]:
gs.best_params

{'rmse': {'bsl_options': {'method': 'sgd'},
  'k': 89,
  'sim_options': {'name': 'pearson', 'user_based': False}}}