In [None]:
#!pip install scikit-surprise

Documentation for the Surprise package is available at http://surprise.readthedocs.io/en/stable/index.html

In [None]:
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import Reader
from surprise import KNNBasic

In [3]:
import os
# Bare relative filename of the ratings file that is loaded further down.
data_file_path = "ratings.dat"
# Echo the path so the cell output shows which file is used.
data_file_path

'ratings.dat'

In [4]:
# A custom dataset needs a Reader that tells Surprise how to parse the file.
# The Reader expects exactly one rating per line, laid out as
#
#     user <sep> item <sep> rating [<sep> timestamp]
#
# where the timestamp field is optional. In the course talk data set used
# here, each line carries only 'user item rating', the fields are separated
# by '|' characters, and there is no timestamp.
#
# Ratings are declared to lie on a 1-5 scale and no leading lines are skipped.
reader_object = Reader(
    line_format='user item rating',
    sep='|',
    rating_scale=(1, 5),
    skip_lines=0,
)
data = Dataset.load_from_file(data_file_path, reader=reader_object)

In [5]:
import random

# Seed the RNG before the data is split so the shuffled fold assignment, and
# therefore every metric reported below, is reproducible across
# Restart & Run All.
# NOTE(review): the Surprise FAQ recommends seeding Python's `random` module
# for reproducible experiments; confirm this matches the installed version.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Split the data into five folds to perform cross-validation.
data.split(n_folds=5)

In [6]:
# User-based collaborative filtering with the basic k-NN algorithm. The
# Surprise package offers a list of other algorithms that could be used
# instead.
similarity_params = dict(
    name='cosine',    # similarity measure
    user_based=True,  # compute similarities between users
)
algo = KNNBasic(sim_options=similarity_params)

# Evaluate the algorithm on the dataset, reporting RMSE and MAE.
# NOTE(review): newer Surprise releases are understood to replace
# evaluate()/print_perf() with surprise.model_selection.cross_validate();
# confirm against the installed version before upgrading.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

# Print the per-fold and mean scores as a table.
print_perf(perf)

Evaluating RMSE, MAE of algorithm KNNBasic.

------------

  sim = construction_func[name](*args)



Fold 1
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9741
MAE:  0.6772
------------
Fold 2
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9411
MAE:  0.6489
------------
Fold 3
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9512
MAE:  0.6702
------------
Fold 4
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9592
MAE:  0.6516
------------
Fold 5
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0105
MAE:  0.6738
------------
------------
Mean RMSE: 0.9672
Mean MAE : 0.6643
------------
------------
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
MAE     0.6772  0.6489  0.6702  0.6516  0.6738  0.6643  
RMSE    0.9741  0.9411  0.9512  0.9592  1.0105  0.9672  


# Grid Search

In the context of machine learning, hyperparameters are parameters whose values are set before the learning process begins. By contrast, the values of the other parameters are derived via training. Hyperparameter optimization, or model selection, is the problem of choosing a set of optimal hyperparameters for a learning algorithm so as to optimize a measure of the algorithm's performance on a data set.

The traditional way of performing hyperparameter optimization has been grid search, or a parameter sweep, which is simply an exhaustive searching through a manually specified subset of the hyperparameter space of a learning algorithm. A grid search algorithm must be guided by some performance metric, typically measured by cross-validation on the training set or evaluation on a held-out validation set.
Grid search suffers from the curse of dimensionality: because it has to evaluate every possible combination of the specified hyperparameter values, the number of runs grows multiplicatively with each additional hyperparameter. Grid search must therefore be used wisely.

In [7]:
from surprise import GridSearch

# Candidate values to sweep: min_k in 1..5 and k in 35..40, i.e. 5 * 6 = 30
# parameter combinations in total.
# NOTE(review): no sim_options are supplied here, so this search does not
# reuse the cosine / user-based settings given to KNNBasic earlier in the
# notebook; confirm that is intended.
param_grid = {
    'min_k': list(range(1, 6)),
    'k': list(range(35, 41)),
}

# Evaluate KNNBasic on the dataset for every combination, tracking RMSE and MAE.
grid_search = GridSearch(KNNBasic, param_grid, measures=['RMSE', 'MAE'])
grid_search.evaluate(data)

------------
Parameters combination 1 of 30
params:  {'min_k': 1, 'k': 35}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
------------
Mean RMSE: 0.9656
Mean FCP : 0.5363
------------
------------
Parameters combination 2 of 30
params:  {'min_k': 1, 'k': 36}
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
------------
Mean RMSE: 0.9656
Mean FCP : 0.5363
---------

In [8]:
# Report the outcome of the grid search: for each metric, the best score and
# the parameter combination that produced it.
best_scores = grid_search.best_score
best_params = grid_search.best_params

# RMSE: best score, then the parameters that achieved it.
print("Best RMSE Score is:{!r}".format(best_scores['RMSE']))
print("Parameter to achieve Best RMSE Score is:{!r}".format(best_params['RMSE']))

# MAE: best score, then the parameters that achieved it.
print("Best MAE Score is :{!r}".format(best_scores['MAE']))
print("Parameter to achieve Best MAE Score is:{!r}".format(best_params['MAE']))


Best RMSE Score is:0.94089109969450635
Parameter to achieve Best RMSE Score is:{'min_k': 3, 'k': 35}
Best FCP Score is :0.53630313019054188
Parameter to achieve Best RMSE Score is:{'min_k': 1, 'k': 35}


In [9]:
# Collect all grid-search results into a pandas table for easier inspection,
# then preview the first five rows.
import pandas as pd

results_df = pd.DataFrame.from_dict(grid_search.cv_results, orient='columns')
results_df.head(n=5)

Unnamed: 0,FCP,RMSE,k,min_k,params,scores
0,0.536303,0.965589,35,1,"{'min_k': 1, 'k': 35}","{'RMSE': 0.965589004571, 'FCP': 0.536303130191}"
1,0.536303,0.965589,36,1,"{'min_k': 1, 'k': 36}","{'RMSE': 0.965589004571, 'FCP': 0.536303130191}"
2,0.536303,0.965589,37,1,"{'min_k': 1, 'k': 37}","{'RMSE': 0.965589004571, 'FCP': 0.536303130191}"
3,0.536303,0.965589,38,1,"{'min_k': 1, 'k': 38}","{'RMSE': 0.965589004571, 'FCP': 0.536303130191}"
4,0.536303,0.965589,39,1,"{'min_k': 1, 'k': 39}","{'RMSE': 0.965589004571, 'FCP': 0.536303130191}"
