## Задание теме «Коллаборативная фильтрация»
### Никифоров Владимир

-  ПАКЕТ SURPRISE

-  используйте данные MovieLens 1M
-  можно использовать любые модели из пакета
-  получите RMSE на тестовом сете 0.87 и ниже
-  В ДЗ на датасет 1М можешь не хватить RAM. Можно сделать на 100K. Качество RMSE предлагаю считать на основе CrossValidation (5 фолдов), а не отложенном датасете

In [38]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

from surprise import Dataset, Reader, KNNBasic, KNNWithMeans, SVD, SVDpp
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import accuracy
from surprise.model_selection import KFold

In [2]:
N_FOLDS = 5
RANDOM_STATE = 777

In [3]:
kfold = KFold(5)

In [19]:
df_ratings = pd.read_csv('../data/ml-latest-small/ratings.csv', usecols=['uid', 'iid', 'rating'], names=['uid', 'iid', 'rating', 'to_drop'], header=1)

In [20]:
print(df_ratings.describe())
df_ratings.head()

                 uid            iid         rating
count  100835.000000  100835.000000  100835.000000
mean      326.130788   19435.488451       3.501552
std       182.616527   35531.110673       1.042533
min         1.000000       1.000000       0.500000
25%       177.000000    1199.000000       3.000000
50%       325.000000    2991.000000       3.500000
75%       477.000000    8123.000000       4.000000
max       610.000000  193609.000000       5.000000


Unnamed: 0,uid,iid,rating
0,1,3,4.0
1,1,6,4.0
2,1,47,5.0
3,1,50,5.0
4,1,70,3.0


In [21]:
reader = Reader(rating_scale=(0.5, 5))

In [23]:
dataset = Dataset.load_from_df(df_ratings, reader)

### Baseline с отложенной выборкой

In [24]:
trainset, testset = train_test_split(dataset, test_size=0.2)

In [30]:
algo = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)
predictions = algo.test(testset)
print(f'RMSE on TEST 20% = {accuracy.rmse(predictions)}')

### CV using 5 folds

In [34]:
scores = []
for trainset, testset in tqdm_notebook(kfold.split(dataset)):
    algo = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo.fit(trainset)
    predictions = algo.test(testset)
    scores.append(accuracy.rmse(predictions))
print(f'Mean RMSE = {np.mean(scores)}')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8727
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8904
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8895
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8852
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8760

Mean RMSE = 0.882760669251145


### Test other algos

In [37]:
algo = SVD()
cross_validate(algo, dataset, measures=['RMSE'], cv=N_FOLDS, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8734  0.8757  0.8749  0.8691  0.8754  0.8737  0.0024  
Fit time          4.34    4.17    4.12    4.25    4.22    4.22    0.07    
Test time         0.15    0.13    0.20    0.12    0.13    0.15    0.03    


{'test_rmse': array([0.87344172, 0.8757409 , 0.87492602, 0.86910094, 0.87542935]),
 'fit_time': (4.335114240646362,
  4.171403408050537,
  4.119460582733154,
  4.251147270202637,
  4.2204506397247314),
 'test_time': (0.1529397964477539,
  0.13019728660583496,
  0.19591808319091797,
  0.12461447715759277,
  0.1292252540588379)}

In [49]:
algo = SVDpp()
cross_validate(algo, dataset, measures=['RMSE'], cv=N_FOLDS, verbose=True)

Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8519  0.8614  0.8641  0.8620  0.8634  0.8605  0.0044  
Fit time          947.04  940.65  915.78  923.27  922.76  929.90  11.86   
Test time         16.30   16.07   16.99   15.97   15.85   16.24   0.41    


{'test_rmse': array([0.85188394, 0.86136704, 0.86410083, 0.86197464, 0.86339344]),
 'fit_time': (947.0383911132812,
  940.6507360935211,
  915.7782490253448,
  923.2658824920654,
  922.7639925479889),
 'test_time': (16.3007333278656,
  16.072584629058838,
  16.994832515716553,
  15.966946363449097,
  15.852218866348267)}

In [50]:
param_grid = {'n_factors': [50, 100, 200], 'n_epochs': [50, 100, 200], 'lr_all': [0.002, 0.005],
              'biased': [True, False], 'reg_all': [0.02, 0.1, 0.2, 0.4], 'random_state': [RANDOM_STATE], 'verbose': [True]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=N_FOLDS, n_jobs=-1, joblib_verbose=True)
gs.fit(dataset)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 20.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 71.3min


0.8488435089125215
{'n_factors': 200, 'n_epochs': 200, 'lr_all': 0.005, 'biased': True, 'reg_all': 0.1, 'random_state': 777, 'verbose': True}


[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 177.6min finished
