In [3]:
import pandas as pd
import pickle
import random
from surprise import prediction_algorithms as pa
from surprise import Dataset, Reader, GridSearch, accuracy, dump
from surprise import evaluate, print_perf
from sklearn.model_selection import train_test_split
import time

In [4]:
data = pd.read_csv('./ml-100k/data.csv')
df = pd.DataFrame(data)
df.drop('timestamp', axis=1, inplace=True)
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
raw_ratings = dataset.raw_ratings
random.shuffle(raw_ratings)
threshold = int(.8 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]
dataset.raw_ratings = A_raw_ratings
dataset.split(n_folds=5)

In [3]:
latent_factors = [30,40,50,60,70,80,90,100,110,120]
regularizations = [0.5,0.2,0.1,0.05,0.02,0.01]
start_time = int(time.time())
param_grid = {'n_factors': latent_factors, 'n_epochs': [100], 'biased': [False], 'reg_all': regularizations}
grid_search = GridSearch(pa.matrix_factorization.SVD, param_grid=param_grid, measures=['MAE', 'RMSE', 'FCP'])
grid_search.evaluate(dataset)
end_time = int(time.time())
print (start_time-end_time)/(len(latent_factors)*len(regularizations))
pickle.dump(grid_search.cv_results, open("svd_result1","wb"))

[{'reg_all': 0.5, 'n_factors': 30, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.5, 'n_factors': 40, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.5, 'n_factors': 50, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.5, 'n_factors': 60, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.5, 'n_factors': 70, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.5, 'n_factors': 80, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.5, 'n_factors': 90, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.5, 'n_factors': 100, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.5, 'n_factors': 110, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.5, 'n_factors': 120, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.2, 'n_factors': 30, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.2, 'n_factors': 40, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.2, 'n_factors': 50, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.2, 'n_factors': 60, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.2, 'n_factors': 

------------
Mean MAE : 0.7364
Mean RMSE: 0.9297
Mean FCP : 0.7137
------------
------------
Parameters combination 22 of 60
params:  {'reg_all': 0.1, 'n_factors': 40, 'biased': False, 'n_epochs': 100}
------------
Mean MAE : 0.7385
Mean RMSE: 0.9322
Mean FCP : 0.7139
------------
------------
Parameters combination 23 of 60
params:  {'reg_all': 0.1, 'n_factors': 50, 'biased': False, 'n_epochs': 100}
------------
Mean MAE : 0.7377
Mean RMSE: 0.9305
Mean FCP : 0.7151
------------
------------
Parameters combination 24 of 60
params:  {'reg_all': 0.1, 'n_factors': 60, 'biased': False, 'n_epochs': 100}
------------
Mean MAE : 0.7374
Mean RMSE: 0.9306
Mean FCP : 0.7163
------------
------------
Parameters combination 25 of 60
params:  {'reg_all': 0.1, 'n_factors': 70, 'biased': False, 'n_epochs': 100}
------------
Mean MAE : 0.7382
Mean RMSE: 0.9308
Mean FCP : 0.7158
------------
------------
Parameters combination 26 of 60
params:  {'reg_all': 0.1, 'n_factors': 80, 'biased': False, 'n_epoc

In [4]:
best_score = grid_search.best_score["RMSE"]
best_params = grid_search.best_params["RMSE"]
print best_score,best_params

0.929726850058 {'reg_all': 0.1, 'n_factors': 30, 'biased': False, 'n_epochs': 100}


the best solutin is obtained to regularization of 0.1 and for finding the optimal number of epochs and latent_factor vary number of factors from 20 to 100 in length of 10 and number of epoch fr

In [5]:
latent_factors = [20,30,40,50,60,70,80,90,100]
epochs = [20,50,100]
start_time = int(time.time())
param_grid = {'n_factors': latent_factors, 'n_epochs': epochs, 'biased': [False], 'reg_all': [0.1]}
grid_search = GridSearch(pa.matrix_factorization.SVD, param_grid=param_grid, measures=['MAE', 'RMSE', 'FCP'])
grid_search.evaluate(dataset)
end_time = int(time.time())
print (start_time-end_time)/(len(latent_factors)*len(epochs))
pickle.dump(grid_search.cv_results, open("svd_result2","wb"))

[{'reg_all': 0.1, 'n_factors': 20, 'biased': False, 'n_epochs': 20}, {'reg_all': 0.1, 'n_factors': 20, 'biased': False, 'n_epochs': 50}, {'reg_all': 0.1, 'n_factors': 20, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.1, 'n_factors': 30, 'biased': False, 'n_epochs': 20}, {'reg_all': 0.1, 'n_factors': 30, 'biased': False, 'n_epochs': 50}, {'reg_all': 0.1, 'n_factors': 30, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.1, 'n_factors': 40, 'biased': False, 'n_epochs': 20}, {'reg_all': 0.1, 'n_factors': 40, 'biased': False, 'n_epochs': 50}, {'reg_all': 0.1, 'n_factors': 40, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.1, 'n_factors': 50, 'biased': False, 'n_epochs': 20}, {'reg_all': 0.1, 'n_factors': 50, 'biased': False, 'n_epochs': 50}, {'reg_all': 0.1, 'n_factors': 50, 'biased': False, 'n_epochs': 100}, {'reg_all': 0.1, 'n_factors': 60, 'biased': False, 'n_epochs': 20}, {'reg_all': 0.1, 'n_factors': 60, 'biased': False, 'n_epochs': 50}, {'reg_all': 0.1, 'n_factors': 60, 'biased':

In [6]:
best_score = grid_search.best_score["RMSE"]
best_params = grid_search.best_params["RMSE"]
print best_score,best_params

0.930927648183 {'reg_all': 0.1, 'n_factors': 60, 'biased': False, 'n_epochs': 100}


the improvement in rmse from number of epoch = 50 to 100 is small . so take the optimal epochs as 50. and the number of factors between 40-80

In [5]:
latent_factors = [40,50,60,70,80]
regularizations = [0.5,0.2,0.1,0.05,0.02]
learning_rates = [0.02,0.01,0.005,0.002]
start_time = int(time.time())
param_grid = {'n_factors': latent_factors, 'n_epochs': [50], 'reg_all': regularizations, 'lr_all': learning_rates}
grid_search = GridSearch(pa.matrix_factorization.SVD, param_grid=param_grid, measures=['MAE', 'RMSE', 'FCP'])
grid_search.evaluate(dataset)
end_time = int(time.time())
print (start_time-end_time)/(len(latent_factors)*len(regularizations))
pickle.dump(grid_search.cv_results, open("svd_result3","wb"))

[{'lr_all': 0.02, 'reg_all': 0.5, 'n_factors': 40, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.5, 'n_factors': 50, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.5, 'n_factors': 60, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.5, 'n_factors': 70, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.5, 'n_factors': 80, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.2, 'n_factors': 40, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.2, 'n_factors': 50, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.2, 'n_factors': 60, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.2, 'n_factors': 70, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.2, 'n_factors': 80, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.1, 'n_factors': 40, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.1, 'n_factors': 50, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.1, 'n_factors': 60, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.1, 'n_factors': 70, 'n_epochs': 50}, {'lr_all': 0.02, 'reg_all': 0.1, 'n_factors': 80, 'n_epochs':

------------
Mean MAE : 0.7457
Mean RMSE: 0.9369
Mean FCP : 0.7010
------------
------------
Parameters combination 9 of 100
params:  {'lr_all': 0.02, 'reg_all': 0.2, 'n_factors': 70, 'n_epochs': 50}
------------
Mean MAE : 0.7456
Mean RMSE: 0.9368
Mean FCP : 0.7010
------------
------------
Parameters combination 10 of 100
params:  {'lr_all': 0.02, 'reg_all': 0.2, 'n_factors': 80, 'n_epochs': 50}
------------
Mean MAE : 0.7455
Mean RMSE: 0.9366
Mean FCP : 0.7009
------------
------------
Parameters combination 11 of 100
params:  {'lr_all': 0.02, 'reg_all': 0.1, 'n_factors': 40, 'n_epochs': 50}
------------
Mean MAE : 0.7338
Mean RMSE: 0.9297
Mean FCP : 0.7058
------------
------------
Parameters combination 12 of 100
params:  {'lr_all': 0.02, 'reg_all': 0.1, 'n_factors': 50, 'n_epochs': 50}
------------
Mean MAE : 0.7332
Mean RMSE: 0.9283
Mean FCP : 0.7076
------------
------------
Parameters combination 13 of 100
params:  {'lr_all': 0.02, 'reg_all': 0.1, 'n_factors': 60, 'n_epochs': 

------------
Mean MAE : 0.7885
Mean RMSE: 1.0076
Mean FCP : 0.6607
------------
------------
Parameters combination 50 of 100
params:  {'lr_all': 0.01, 'reg_all': 0.02, 'n_factors': 80, 'n_epochs': 50}
------------
Mean MAE : 0.7871
Mean RMSE: 1.0028
Mean FCP : 0.6622
------------
------------
Parameters combination 51 of 100
params:  {'lr_all': 0.005, 'reg_all': 0.5, 'n_factors': 40, 'n_epochs': 50}
------------
Mean MAE : 0.7729
Mean RMSE: 0.9623
Mean FCP : 0.6905
------------
------------
Parameters combination 52 of 100
params:  {'lr_all': 0.005, 'reg_all': 0.5, 'n_factors': 50, 'n_epochs': 50}
------------
Mean MAE : 0.7729
Mean RMSE: 0.9623
Mean FCP : 0.6905
------------
------------
Parameters combination 53 of 100
params:  {'lr_all': 0.005, 'reg_all': 0.5, 'n_factors': 60, 'n_epochs': 50}
------------
Mean MAE : 0.7729
Mean RMSE: 0.9623
Mean FCP : 0.6905
------------
------------
Parameters combination 54 of 100
params:  {'lr_all': 0.005, 'reg_all': 0.5, 'n_factors': 70, 'n_epo

------------
Mean MAE : 0.7497
Mean RMSE: 0.9447
Mean FCP : 0.6919
------------
------------
Parameters combination 91 of 100
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 40, 'n_epochs': 50}
------------
Mean MAE : 0.7458
Mean RMSE: 0.9421
Mean FCP : 0.6910
------------
------------
Parameters combination 92 of 100
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 50, 'n_epochs': 50}
------------
Mean MAE : 0.7459
Mean RMSE: 0.9423
Mean FCP : 0.6921
------------
------------
Parameters combination 93 of 100
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 60, 'n_epochs': 50}
------------
Mean MAE : 0.7465
Mean RMSE: 0.9430
Mean FCP : 0.6908
------------
------------
Parameters combination 94 of 100
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 70, 'n_epochs': 50}
------------
Mean MAE : 0.7459
Mean RMSE: 0.9422
Mean FCP : 0.6917
------------
------------
Parameters combination 95 of 100
params:  {'lr_all': 0.002, 'reg_all': 0.05, 'n_factors': 80, '

best RMSE is obtained for a learning rate of 0.001 with regularization 0.1 and number of factors = 80
or with the parameters learning rate = 0.005 and regularization = 0.05 and number of factors = 70 

In [13]:
start_time = int(time.time())
trainset = dataset.build_full_trainset()
algo = pa.matrix_factorization.SVD(n_factors=80, n_epochs=50, biased=True, reg_all=0.1, lr_all=0.001)
algo.train(trainset)
end_time = int(time.time())
print (start_time-end_time)
testset = dataset.construct_testset(B_raw_ratings)
predictions = algo.test(testset)
print 'Unbiased accuracy on B,', accuracy.rmse(predictions)
accuracy.rmse(predictions)
accuracy.mae(predictions)
accuracy.fcp(predictions)
dump.dump('./svd_algo',predictions,algo)

-6
Unbiased accuracy on B,RMSE: 0.9494
 0.949366168968
RMSE: 0.9494
MAE:  0.7508
FCP:  0.6979
