In [1]:
from surprise import SVD, evaluate
from surprise import GridSearch
from surprise import KNNWithMeans, KNNBasic, KNNWithZScore, KNNBaseline
from surprise import Dataset
from tqdm import tqdm_notebook
from surprise import Reader
import pandas as pd
import numpy as np

In [2]:
# Load the three MovieLens tables (movies, ratings, tags) from the working directory.
movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in ('movie.csv', 'rating.csv', 'tag.csv')
)

In [3]:
# Preview the first rows of the movies table (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
# Preview the first rows of the tags table (userId, movieId, tag, timestamp).
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18


In [6]:
# Attach the first 1,000,000 ratings to their movie rows (DataFrame.join is a
# left join, matched on movieId), renumber the rows, and then drop every row
# containing a missing value -- which removes movies that received no rating
# inside the sampled slice.
movies_with_ratings = (
    movies[:1000000]
    .join(ratings[:1000000].set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [7]:
# Build the three-column frame handed to Surprise: user id, item id, rating.
# The column order is pinned explicitly with `columns=` instead of relying on
# dict ordering (older pandas / Python versions sort dict keys alphabetically),
# because Dataset.load_from_df interprets the columns by position.
# NOTE(review): the movie title is used as the item id; if two distinct
# movieIds share a title their ratings are merged -- confirm this is intended.
dataset = pd.DataFrame(
    {
        'uid': movies_with_ratings.userId,
        'iid': movies_with_ratings.title,
        'rating': movies_with_ratings.rating,
    },
    columns=['uid', 'iid', 'rating'],
)

In [8]:
# Lowest rating in the full ratings table; used as the lower bound of the Reader's rating scale.
ratings.rating.min()

0.5

In [9]:
# Highest rating in the full ratings table; used as the upper bound of the Reader's rating scale.
ratings.rating.max()

5.0

In [10]:
# Tell Surprise the rating range (0.5 to 5.0, matching the min/max checked above).
reader = Reader(rating_scale=(0.5, 5.0))
# Wrap the frame as a Surprise Dataset; load_from_df reads the columns by
# position as (user id, item id, rating).
data = Dataset.load_from_df(dataset, reader)

In [11]:
# Shared similarity configuration: user-user neighbourhoods compared with the
# pearson_baseline measure.
opt = {'name': 'pearson_baseline', 'user_based': True}
# One instance of each KNN variant, all with 50 neighbours and the same
# similarity options, in the order they are evaluated later.
algorithms = [
    knn_class(k=50, sim_options=opt)
    for knn_class in (KNNWithMeans, KNNBasic, KNNWithZScore, KNNBaseline)
]

In [12]:
# Partition the dataset into 5 cross-validation folds; the evaluate() runs
# below report one RMSE per fold.
# NOTE(review): Dataset.split() is part of Surprise's legacy evaluation API --
# confirm the installed Surprise version still provides it before upgrading.
data.split(n_folds=5)

In [13]:
# Cross-validate every KNN variant on the prepared folds and keep a summary of
# each run, so the algorithms can be compared without re-reading the log output.
# Each entry of accuracy_algo is (algorithm class name, mean RMSE over the folds).
accuracy_algo = []
# Candidate similarity settings; not consumed in this cell.
sim_opt = {'name': ['cosine', 'pearson_baseline'],
          'user_based': [True, False]}
for algo in tqdm_notebook(algorithms):
    # evaluate() returns a dict-like object whose 'rmse' entry lists the
    # per-fold scores; previously this result was discarded.
    perf = evaluate(algo, data, measures=['RMSE'])
    accuracy_algo.append((type(algo).__name__, np.mean(perf['rmse'])))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))



Evaluating RMSE of algorithm KNNWithMeans.





------------
Fold 1
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8436
------------
Fold 2
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8481
------------
Fold 3
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8464
------------
Fold 4
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8475
------------
Fold 5
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8456
------------
------------
Mean RMSE: 0.8462
------------
------------
Evaluating RMSE of algorithm KNNBasic.

------------
Fold 1
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMS

In [14]:
# Matrix-factorisation comparison point: cross-validate SVD with its default
# hyper-parameters on the folds defined by data.split().
algo = SVD()
# Kept as the cell's last expression so the per-fold RMSE values are displayed.
evaluate(algo, data, measures=['RMSE'])

Evaluating RMSE of algorithm SVD.

------------
Fold 1
RMSE: 0.8320
------------
Fold 2
RMSE: 0.8360
------------
Fold 3
RMSE: 0.8354
------------
Fold 4
RMSE: 0.8343
------------
Fold 5
RMSE: 0.8345
------------
------------
Mean RMSE: 0.8345
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.832023742848042,
                             0.8359991330609238,
                             0.8354344224912943,
                             0.8342629978345586,
                             0.8345387360525557]})

In [15]:
# Hyper-parameter sweep for SVD: two learning rates x two regularisation
# strengths, each combination scored by cross-validated RMSE.
param_grid = dict(
    lr_all=[0.002, 0.005],
    reg_all=[0.4, 0.6],
)
# GridSearch receives the SVD class itself (not an instance) and builds one
# model per parameter combination.
grid_search = GridSearch(SVD, param_grid, measures=['RMSE'])
grid_search.evaluate(data)



Running grid search for the following parameter combinations:
{'lr_all': 0.002, 'reg_all': 0.4}
{'lr_all': 0.002, 'reg_all': 0.6}
{'lr_all': 0.005, 'reg_all': 0.4}
{'lr_all': 0.005, 'reg_all': 0.6}
Resulsts:
{'lr_all': 0.002, 'reg_all': 0.4}
{'RMSE': 0.8837612263308479}
----------
{'lr_all': 0.002, 'reg_all': 0.6}
{'RMSE': 0.8947060014284144}
----------
{'lr_all': 0.005, 'reg_all': 0.4}
{'RMSE': 0.8806443638167639}
----------
{'lr_all': 0.005, 'reg_all': 0.6}
{'RMSE': 0.8921051279863266}
----------
