In [1]:
import pandas as pd
import pickle
import random
from surprise import prediction_algorithms as pa
from surprise import Dataset, Reader, GridSearch, accuracy, dump
from surprise import evaluate, print_perf
from sklearn.model_selection import train_test_split
import time

Split the data into two part, A_raw_ratings for model training and tuning. B_raw_ratings for unbiased testing

In [2]:
data = pd.read_csv('./ml-100k/data.csv')
df = pd.DataFrame(data)
df.drop('timestamp', axis=1, inplace=True)
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
raw_ratings = dataset.raw_ratings
random.shuffle(raw_ratings)
threshold = int(.8 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]
dataset.raw_ratings = A_raw_ratings
dataset.split(n_folds=5)

In [3]:
res_tune = {}
latent_factors = [5,10,15,20,25,30]
regularizations = [0.5,0.2,0.1,0.05,0.02,0.01,0.005]
for regularization in regularizations:
    print "regularization : ", regularization
    start_time = int(time.time())
    param_grid = {'n_factors': latent_factors, 'n_epochs': [100], 'reg_pu': [regularization], 'reg_qi': [regularization]}
    grid_search = GridSearch(pa.matrix_factorization.NMF, param_grid=param_grid, measures=['MAE', 'RMSE', 'FCP'])
    grid_search.evaluate(dataset)
    end_time = int(time.time())
    res_tune[regularization] = grid_search.cv_results
    print (start_time-end_time)/len(latent_factors)
pickle.dump(res_tune, open("nmf_result1","wb"))

regularization :  0.5
[{'n_factors': 5, 'reg_qi': 0.5, 'reg_pu': 0.5, 'n_epochs': 100}, {'n_factors': 10, 'reg_qi': 0.5, 'reg_pu': 0.5, 'n_epochs': 100}, {'n_factors': 15, 'reg_qi': 0.5, 'reg_pu': 0.5, 'n_epochs': 100}, {'n_factors': 20, 'reg_qi': 0.5, 'reg_pu': 0.5, 'n_epochs': 100}, {'n_factors': 25, 'reg_qi': 0.5, 'reg_pu': 0.5, 'n_epochs': 100}, {'n_factors': 30, 'reg_qi': 0.5, 'reg_pu': 0.5, 'n_epochs': 100}]
------------
Parameters combination 1 of 6
params:  {'n_factors': 5, 'reg_qi': 0.5, 'reg_pu': 0.5, 'n_epochs': 100}
------------
Mean MAE : 0.8866
Mean RMSE: 1.0731
Mean FCP : 0.6880
------------
------------
Parameters combination 2 of 6
params:  {'n_factors': 10, 'reg_qi': 0.5, 'reg_pu': 0.5, 'n_epochs': 100}
------------
Mean MAE : 0.8866
Mean RMSE: 1.0731
Mean FCP : 0.6880
------------
------------
Parameters combination 3 of 6
params:  {'n_factors': 15, 'reg_qi': 0.5, 'reg_pu': 0.5, 'n_epochs': 100}
------------
Mean MAE : 0.8866
Mean RMSE: 1.0731
Mean FCP : 0.6880
-----

vary the number of factors and regularization for the user and item vectors
the best solutions are obtained for number of factors close to 20 and with regularization of 0.1
we choose number of factors as 20, to not overfit the model

In [4]:
for reg in res_tune:
    c = 0
    print "regularization : ", reg 
    s = ""
    for i in res_tune[reg]['RMSE']:
        s+=str(latent_factors[c])+"||"+str(i)+"   "
        c+=1
    print s

regularization :  0.5
5||1.07308604174   10||1.07309135058   15||1.07309299279   20||1.07309222039   25||1.07309052124   30||1.07309095147   
regularization :  0.005
5||1.91250738147   10||1.18932411735   15||1.09713356077   20||1.32522968925   25||1.49567172442   30||1.59619051221   
regularization :  0.1
5||0.955835794036   10||0.949769414646   15||0.950549178723   20||0.947425118203   25||0.947188897764   30||0.943521413139   
regularization :  0.05
5||0.983817796788   10||0.98361691683   15||0.985018167585   20||0.981587486482   25||0.97904367206   30||0.976582431465   
regularization :  0.2
5||0.958866705685   10||0.957038052954   15||0.955767971055   20||0.955937563144   25||0.955648293572   30||0.954529576677   
regularization :  0.01
5||1.59664307885   10||1.10287826086   15||1.06731450563   20||1.18968878497   25||1.31206178628   30||1.39850305788   
regularization :  0.02
5||1.22060010699   10||1.03780494421   15||1.03710508609   20||1.06478667118   25||1.10142893328   30||1.

In [5]:
res_tune = {}
latent_factors = [15,20,25]
regularizations = [0.2,0.1,0.05,0.02]
learn_bias = [0.02,0.01,0.005,0.002,0.001]
for regularization in regularizations:
    res_tune[regularization] = {}
    for lb in learn_bias:
        print "regularization : ", regularization, " bias : ", lb
        start_time = int(time.time())
        param_grid = {'n_factors': latent_factors, 'n_epochs': [100], 'reg_pu': [regularization], 'reg_qi': [regularization], 
                      'biased': [True], 'lr_bu': [lb], 'lr_bi': [lb], 'reg_bu': [regularization], 'reg_bi': [regularization]}
        grid_search = GridSearch(pa.matrix_factorization.NMF, param_grid=param_grid, measures=['MAE', 'RMSE', 'FCP'])
        grid_search.evaluate(dataset)
        end_time = int(time.time())
        print (start_time-end_time)
        res_tune[regularization][lb] = grid_search.cv_results
pickle.dump(res_tune, open("nmf_result2","wb"))

regularization :  0.2  bias :  0.02
[{'biased': True, 'reg_bi': 0.2, 'reg_pu': 0.2, 'lr_bu': 0.02, 'lr_bi': 0.02, 'reg_bu': 0.2, 'reg_qi': 0.2, 'n_factors': 15, 'n_epochs': 100}, {'biased': True, 'reg_bi': 0.2, 'reg_pu': 0.2, 'lr_bu': 0.02, 'lr_bi': 0.02, 'reg_bu': 0.2, 'reg_qi': 0.2, 'n_factors': 20, 'n_epochs': 100}, {'biased': True, 'reg_bi': 0.2, 'reg_pu': 0.2, 'lr_bu': 0.02, 'lr_bi': 0.02, 'reg_bu': 0.2, 'reg_qi': 0.2, 'n_factors': 25, 'n_epochs': 100}]
------------
Parameters combination 1 of 3
params:  {'biased': True, 'reg_bi': 0.2, 'reg_pu': 0.2, 'lr_bu': 0.02, 'lr_bi': 0.02, 'reg_bu': 0.2, 'reg_qi': 0.2, 'n_factors': 15, 'n_epochs': 100}
------------
Mean MAE : 0.7939
Mean RMSE: 1.0004
Mean FCP : 0.6808
------------
------------
Parameters combination 2 of 3
params:  {'biased': True, 'reg_bi': 0.2, 'reg_pu': 0.2, 'lr_bu': 0.02, 'lr_bi': 0.02, 'reg_bu': 0.2, 'reg_qi': 0.2, 'n_factors': 20, 'n_epochs': 100}
------------
Mean MAE : 0.7916
Mean RMSE: 1.0049
Mean FCP : 0.6746
----

------------
Mean MAE : 0.7435
Mean RMSE: 0.9399
Mean FCP : 0.6946
------------
------------
Parameters combination 2 of 3
params:  {'biased': True, 'reg_bi': 0.1, 'reg_pu': 0.1, 'lr_bu': 0.01, 'lr_bi': 0.01, 'reg_bu': 0.1, 'reg_qi': 0.1, 'n_factors': 20, 'n_epochs': 100}
------------
Mean MAE : 0.7706
Mean RMSE: 0.9829
Mean FCP : 0.6788
------------
------------
Parameters combination 3 of 3
params:  {'biased': True, 'reg_bi': 0.1, 'reg_pu': 0.1, 'lr_bu': 0.01, 'lr_bi': 0.01, 'reg_bu': 0.1, 'reg_qi': 0.1, 'n_factors': 25, 'n_epochs': 100}
------------
Mean MAE : 0.8020
Mean RMSE: 1.0285
Mean FCP : 0.6758
------------
-36
regularization :  0.1  bias :  0.005
[{'biased': True, 'reg_bi': 0.1, 'reg_pu': 0.1, 'lr_bu': 0.005, 'lr_bi': 0.005, 'reg_bu': 0.1, 'reg_qi': 0.1, 'n_factors': 15, 'n_epochs': 100}, {'biased': True, 'reg_bi': 0.1, 'reg_pu': 0.1, 'lr_bu': 0.005, 'lr_bi': 0.005, 'reg_bu': 0.1, 'reg_qi': 0.1, 'n_factors': 20, 'n_epochs': 100}, {'biased': True, 'reg_bi': 0.1, 'reg_pu': 0.

------------
Mean MAE : 1.2173
Mean RMSE: 1.5279
Mean FCP : 0.5556
------------
-35
regularization :  0.05  bias :  0.002
[{'biased': True, 'reg_bi': 0.05, 'reg_pu': 0.05, 'lr_bu': 0.002, 'lr_bi': 0.002, 'reg_bu': 0.05, 'reg_qi': 0.05, 'n_factors': 15, 'n_epochs': 100}, {'biased': True, 'reg_bi': 0.05, 'reg_pu': 0.05, 'lr_bu': 0.002, 'lr_bi': 0.002, 'reg_bu': 0.05, 'reg_qi': 0.05, 'n_factors': 20, 'n_epochs': 100}, {'biased': True, 'reg_bi': 0.05, 'reg_pu': 0.05, 'lr_bu': 0.002, 'lr_bi': 0.002, 'reg_bu': 0.05, 'reg_qi': 0.05, 'n_factors': 25, 'n_epochs': 100}]
------------
Parameters combination 1 of 3
params:  {'biased': True, 'reg_bi': 0.05, 'reg_pu': 0.05, 'lr_bu': 0.002, 'lr_bi': 0.002, 'reg_bu': 0.05, 'reg_qi': 0.05, 'n_factors': 15, 'n_epochs': 100}
------------
Mean MAE : 0.7397
Mean RMSE: 0.9413
Mean FCP : 0.6938
------------
------------
Parameters combination 2 of 3
params:  {'biased': True, 'reg_bi': 0.05, 'reg_pu': 0.05, 'lr_bu': 0.002, 'lr_bi': 0.002, 'reg_bu': 0.05, 'reg_

------------
Mean MAE : 0.7632
Mean RMSE: 0.9794
Mean FCP : 0.6736
------------
------------
Parameters combination 2 of 3
params:  {'biased': True, 'reg_bi': 0.02, 'reg_pu': 0.02, 'lr_bu': 0.001, 'lr_bi': 0.001, 'reg_bu': 0.02, 'reg_qi': 0.02, 'n_factors': 20, 'n_epochs': 100}
------------
Mean MAE : 0.7666
Mean RMSE: 0.9835
Mean FCP : 0.6739
------------
------------
Parameters combination 3 of 3
params:  {'biased': True, 'reg_bi': 0.02, 'reg_pu': 0.02, 'lr_bu': 0.001, 'lr_bi': 0.001, 'reg_bu': 0.02, 'reg_qi': 0.02, 'n_factors': 25, 'n_epochs': 100}
------------
Mean MAE : 0.7696
Mean RMSE: 0.9886
Mean FCP : 0.6719
------------
-36


for a biased version vary the number of factors and regularization close in a range close to optimal solution and vary the learning rate for the bias

In [6]:
for reg in res_tune:
    for bias in res_tune[reg]:
        c = 0
        print "regularization : ", reg, " bias : ", bias 
        s = ""
        for i in res_tune[reg][bias]['RMSE']:
            s+=str(latent_factors[c])+"||"+str(i)+"   "
            c+=1
        print s

regularization :  0.05  bias :  0.001
15||0.940895079056   20||0.941689980199   25||0.939116823657   
regularization :  0.05  bias :  0.005
15||1.10447613708   20||1.15486679336   25||1.52789906192   
regularization :  0.05  bias :  0.002
15||0.941254990968   20||1.0358177903   25||1.45308509002   
regularization :  0.05  bias :  0.01
15||0.9958792156   20||1.17675987434   25||1.28278623299   
regularization :  0.05  bias :  0.02
15||0.953456768683   20||1.04811593873   25||1.23977754099   
regularization :  0.02  bias :  0.001
15||0.979408311318   20||0.983514337539   25||0.988571619378   
regularization :  0.02  bias :  0.005
15||1.19294864039   20||1.40925861834   25||1.34915227895   
regularization :  0.02  bias :  0.002
15||0.981908301468   20||1.28495264158   25||1.67735246476   
regularization :  0.02  bias :  0.01
15||1.20768878809   20||1.50119088556   25||1.34032098212   
regularization :  0.02  bias :  0.02
15||1.00069139131   20||1.20860409605   25||1.50071317101   
regular

the best solution is obtained for the model with the parameters
number of factors = 20
regularization = 0.1/0.05
and learning rate for bias = 0.001
now to learn the model with these parameters

In [7]:
start_time = int(time.time())
trainset = dataset.build_full_trainset()
algo = pa.matrix_factorization.NMF(n_factors=20, n_epochs=100, biased=True, reg_pu=0.1, reg_qi=0.1, reg_bu=0.1, reg_bi=0.1, lr_bu=0.001, lr_bi=0.001)
algo.train(trainset)
end_time = int(time.time())
print (start_time-end_time)
testset = dataset.construct_testset(B_raw_ratings)
predictions = algo.test(testset)
print 'Unbiased accuracy on B,', accuracy.rmse(predictions)
accuracy.rmse(predictions)
accuracy.mae(predictions)
accuracy.fcp(predictions)
dump.dump('./nmf_algo',predictions,algo)

-3
Unbiased accuracy on B,RMSE: 0.9292
 0.929151467398
RMSE: 0.9292
MAE:  0.7383
FCP:  0.7112
