## ПАКЕТ SURPRISE
 * используйте данные MovieLens 1M

 * можно использовать любые модели из пакета
 * получите RMSE на тестовом сете 0.87 и ниже
 Комментарий преподавателя:
 * В ДЗ на датасет 1М может не хватить RAM. Можно сделать на 100K. Качество RMSE предлагаю считать на основе CrossValidation (5 фолдов), а не отложенном датасете.

In [13]:
from surprise import KNNWithMeans, KNNBasic, SVD, SVDpp, NMF #методы ближ соседей
from surprise import Dataset # модуль для обработки данных спец способом (без этого surprise не работает)
from surprise import accuracy #модуль с метриками, не только accuracy
from surprise import Reader # работает с Dataset
# from surprise.model_selection import train_test_split
import pandas as pd
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Делаю на 100К
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [5]:
# преобразовываем датасет для использования пакетом surprise
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [6]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [7]:
min_ = dataset.rating.min()
max_ = dataset.rating.max()
print(f'Минимальное значение рейтинга: {min_}')
print(f'Максимальное значение рейтинга: {max_}')

Минимальное значение рейтинга: 0.5
Максимальное значение рейтинга: 5.0


In [8]:
# trainset, testset = train_test_split(dataset, test_size=.15) #делим данные на обучаемые и тестовые
# # делю в sklearn, потом уже прогоняю через reader в surprise

In [9]:
reader = Reader(rating_scale=(min_, max_)) #задаем границы рейтинга
dataset = Dataset.load_from_df(dataset, reader)
# train = Dataset.load_from_df(trainset, reader)
# test = Dataset.load_from_df(testset, reader)

In [17]:
# KNNBasic - подберем параметры
param_grid_knn_basic = {'k': [10, 30, 50],
              'sim_options': {'name': ['msd', 'cosine','pearson']}
              }
gs_knn_basic = GridSearchCV(KNNBasic, param_grid_knn_basic, measures=['rmse'], cv=5)
gs_knn_basic.fit(dataset)

# лучший RMSE
print(gs_knn_basic.best_score['rmse'])

# комбинация параметров, которая дает лучший RMSE
print(gs_knn_basic.best_params['rmse'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson si

In [18]:
algo_knn_basic = gs_knn_basic.best_estimator['rmse']
# посчитаем RMSE при помощи cross_validate
cross_validate(algo_knn_basic, dataset, measures = ['RMSE'], cv = 5, verbose = True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9499  0.9373  0.9354  0.9452  0.9439  0.9423  0.0053  
Fit time          0.23    0.28    0.26    0.33    0.32    0.29    0.04    
Test time         1.53    1.47    1.38    2.06    1.47    1.58    0.24    


{'test_rmse': array([0.94989658, 0.93727168, 0.9354111 , 0.94521787, 0.94388124]),
 'fit_time': (0.23436236381530762,
  0.2812917232513428,
  0.25920820236206055,
  0.33197617530822754,
  0.31854987144470215),
 'test_time': (1.5296697616577148,
  1.470170497894287,
  1.3818585872650146,
  2.055899143218994,
  1.4710314273834229)}

In [19]:
# KNNWithMeans - подберем параметры
param_knn_means = {'k': [10, 30, 50],
              'sim_options': {'name': ['msd', 'cosine','pearson']}
              }
gs_knn_means = GridSearchCV(KNNWithMeans, param_knn_means, measures=['rmse'], cv=5)
gs_knn_means.fit(dataset)

# лучший RMSE
print(gs_knn_means.best_score['rmse'])

# комбинация параметров, которая дает лучший RMSE
print(gs_knn_means.best_params['rmse'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson si

In [21]:
algo_knn_means = gs_knn_means.best_estimator['rmse']
# посчитаем RMSE при помощи cross_validate
cross_validate(algo_knn_means, dataset, measures = ['RMSE'], cv = 5, verbose = True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8897  0.9071  0.8952  0.8915  0.8978  0.8963  0.0061  
Fit time          1.04    1.04    1.02    1.02    1.23    1.07    0.08    
Test time         2.18    2.00    2.03    2.22    2.26    2.14    0.10    


{'test_rmse': array([0.88969819, 0.90705957, 0.89516334, 0.89154021, 0.89784739]),
 'fit_time': (1.0446743965148926,
  1.0418808460235596,
  1.0224239826202393,
  1.015566110610962,
  1.2313220500946045),
 'test_time': (2.1831510066986084,
  2.004120111465454,
  2.028127670288086,
  2.2166616916656494,
  2.2628538608551025)}

In [10]:
# SVD - подберем параметры

param_grid_svd = {'n_factors': [100, 200], 'n_epochs': [50, 100], 'lr_all': [0.005, 0.01],
              'reg_all': [0.2,0.4,0.6]}
gs_svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse'], cv=5, n_jobs=-1)
gs_svd.fit(dataset)

# лучший RMSE
print(gs_svd.best_score['rmse'])

# комбинация параметров, которая дает лучший RMSE
print(gs_svd.best_params['rmse'])

0.8622160485299792
{'n_factors': 200, 'n_epochs': 100, 'lr_all': 0.01, 'reg_all': 0.2}


In [11]:
algo_svd = gs_svd.best_estimator['rmse']
cross_validate(algo_svd, dataset, measures = ['RMSE'], cv = 5, verbose = True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8690  0.8564  0.8695  0.8657  0.8566  0.8634  0.0058  
Fit time          52.92   51.43   51.91   52.27   53.78   52.46   0.82    
Test time         0.28    0.20    0.16    0.20    0.20    0.21    0.04    


{'test_rmse': array([0.86895913, 0.85643331, 0.86949165, 0.86567364, 0.85660721]),
 'fit_time': (52.91525745391846,
  51.426679849624634,
  51.91125512123108,
  52.26606464385986,
  53.78134560585022),
 'test_time': (0.284271240234375,
  0.20313048362731934,
  0.1562497615814209,
  0.20207881927490234,
  0.20411062240600586)}

In [16]:
# # SVDpp

# param_grid_pp = {'n_factors': [200], 'n_epochs': [100], 'lr_all': [0.01],
#               'reg_all': [0.2,0.4]}
# gs3 = GridSearchCV(SVDpp, param_grid_pp, measures=['rmse'], cv=3, n_jobs=-1)
# gs3.fit(dataset)

# # лучший RMSE
# print(gs3.best_score['rmse'])

# # комбинация параметров, которая дает лучший RMSE
# print(gs3.best_params['rmse'])
# уж очень долго считает...

In [15]:
# svdpp = SVDpp()
# # evaluate(svdpp, dataset, measures = ['RMSE'])
# cross_validate(svdpp, dataset, measures = ['RMSE'], cv = 5, verbose = True)

In [15]:
# NMF - подберем параметры

param_grid_nmf = {'n_factors': [100, 200], 'n_epochs': [50, 100]}
gs_nmf = GridSearchCV(NMF, param_grid_nmf, measures=['rmse'], cv=5, n_jobs=-1)
gs_nmf.fit(dataset)

# лучший RMSE
print(gs_nmf.best_score['rmse'])

# комбинация параметров, которая дает лучший RMSE
print(gs_nmf.best_params['rmse'])

0.883271062667817
{'n_factors': 200, 'n_epochs': 100}


In [16]:
algo_nmf = gs_nmf.best_estimator['rmse']
cross_validate(algo_nmf, dataset, measures = ['RMSE'], cv = 5, verbose = True)

Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8781  0.8876  0.8823  0.8831  0.8799  0.8822  0.0032  
Fit time          90.94   89.01   85.51   92.31   93.47   90.25   2.80    
Test time         0.22    0.14    0.39    0.18    0.28    0.24    0.09    


{'test_rmse': array([0.87810173, 0.88758265, 0.8823277 , 0.88310833, 0.87986445]),
 'fit_time': (90.9371747970581,
  89.01069712638855,
  85.51044988632202,
  92.30630373954773,
  93.47210431098938),
 'test_time': (0.21549272537231445,
  0.14063024520874023,
  0.3925766944885254,
  0.17702770233154297,
  0.27512645721435547)}

In [None]:
# Самый лучший RMSE на тесте (0.8634) дал SVD c параметрами {'n_factors': 200, 'n_epochs': 100, 'lr_all': 0.01, 'reg_all': 0.2}