In [2]:
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf

In [16]:
import os
import random

In [4]:

def printStats(preds):
    """compute some statistics (RMSE, coverage...) on a list of predictions"""

    if not preds:
        print("looks like there's no prediction...")
        return

    nOK = nKO = nImp = 0
        
    sumSqErr = 0
    sumAbsErr = 0
    
    nRecoOK = nRecoKO = 0


    threshold = 4. # we recommend m to u iff estimation >= threshold

    for p in preds:

        sumSqErr += (p['r0'] - p['est'])**2
        sumAbsErr += abs(p['r0'] - p['est'])

        if p['est'] >= threshold: # we recommend m to u
            if p['r0'] >= threshold: # we did well
                nRecoOK += 1
            else: # we shouldn't have...
                nRecoKO += 1

        if p['est'] == p['r0']:
            nOK += 1
        else:
            nKO += 1
        if p['wasImpossible']:
            nImp += 1

    rmse = np.sqrt(sumSqErr / (nOK + nKO))
    mae = np.sqrt(sumAbsErr / (nOK + nKO))
    accRate = nOK / (nOK + nKO)
    precision = nRecoOK / (nRecoOK + nRecoKO)
    recall = nRecoOK / sum(True for p in preds if p['r0'] >= threshold)

    print('Nb impossible predictions:', nImp)
    print('RMSE: {0:1.4f}'.format(rmse))
    print('MAE: {0:1.4f}'.format(mae))
    print('sample size:', len(preds))
    print('Accuracy rate: {0:1.4f}'.format(accRate))
    print('Precision: {0:1.2f}'.format(precision))
    print('recall: {0:1.2f}'.format(recall))

In [42]:



# Load the movielens-100k dataset (download it if needed),
# and split it into 3 folds for cross-validation.
#data = Dataset.load_builtin('ml-100k')
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)




In [43]:
first_train = None
first_test = None
for trainset, testset in data.folds():
    first_train, first_test = trainset, testset

In [44]:
first_train.n_items

1624

In [None]:
# We'll use the famous SVD algorithm.
algo = SVD()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

print_perf(perf)


In [6]:

    
# train and test algorithm.
algo.train(first_train)
predictions = algo.test(first_test)



TypeError: printStats() takes exactly 1 argument (0 given)

In [7]:
printStats(predictions)

TypeError: tuple indices must be integers, not str

In [2]:
dir(algo)

['__doc__',
 '__init__',
 '__module__',
 '__qualname__',
 'bi',
 'biased',
 'bsl_options',
 'bu',
 'compute_baselines',
 'compute_similarities',
 'estimate',
 'lr_bi',
 'lr_bu',
 'lr_pu',
 'lr_qi',
 'n_epochs',
 'n_factors',
 'predict',
 'pu',
 'qi',
 'reg_bi',
 'reg_bu',
 'reg_pu',
 'reg_qi',
 'sgd',
 'sim_options',
 'test',
 'train',
 'trainset',
 'verbose']

In [4]:
help(algo.predict)

Help on method predict in module surprise.prediction_algorithms.algo_base:

predict(self, uid, iid, r_ui, clip=True, verbose=False) method of surprise.prediction_algorithms.matrix_factorization.SVD instance
    Compute the rating prediction for given user and item.
    
    The ``predict`` method converts raw ids to inner ids and then calls the
    ``estimate`` method which is defined in every derived class. If the
    prediction is impossible (for whatever reason), the prediction is set
    to the global mean of all ratings.
    
    Args:
        uid: (Raw) id of the user. See :ref:`this note<raw_inner_note>`.
        iid: (Raw) id of the item. See :ref:`this note<raw_inner_note>`.
        r_ui(float): The true rating :math:`r_{ui}`.
        clip(bool): Whether to clip the estimation into the rating scale.
            For example, if :math:`\hat{r}_{ui}` is :math:`5.5` while the
            rating scale is :math:`[1, 5]`, then :math:`\hat{r}_{ui}` is
            set to :math:`5`. Sam

In [5]:
help(algo.train)

Help on method train in module surprise.prediction_algorithms.matrix_factorization:

train(...) method of surprise.prediction_algorithms.matrix_factorization.SVD instance



In [14]:

def extract_data_from_file_and_generate_train_and_test(filename, train_percent, seed, delimiter, test_data_inner_ratio, sort_by_time=False):
    test = None
    train = None
    data = {}
    random.seed(seed)

    with open(filename , 'r') as f:
        first_line = f.readline()
        for i, line in enumerate(f):
            userId, movieId, rating, timestamp = line.split(delimiter)
            #userId = int(userId)
            #movieId = int(movieId)
            rating = float(rating)
            timestamp = int(timestamp)

            if userId not in data:
                data[userId] = []
            data[userId].append((movieId, rating, timestamp))

    test = {}
    train = {}
    for userId in data:
        total_len = len(data[userId])
        if random.random() >= train_percent:
        #if 0 == random.randint(0, 2):
        #if 2 == random.randint(0, 2):
            test[userId] = data[userId]
        else:
            train[userId] = data[userId]
    userId = None

    for userId in test:
        test[userId].sort(key=lambda x: x[2])
    # sort by time: PART 1<begin>
    if sort_by_time:
        print 'sort by time PART 1'
        for userId in train:
            train[userId].sort(key=lambda x: x[2])

    #print train[train.keys()[0]]
    #print test[test.keys()[0]]
    #raw_input()
    # sort by time: PART 1 <end>

    ### split test data further
    test_real = {}
#    if not sort_by_time:
#        for k_user in test:
#            test_real[k_user] = [[], []]
#            for m, r, t in test[k_user]:
#                # every record of test dataset is supposed to be splitted into 2 parts: input part and fact/answer part
#                # How to specify the relative of these parts? 
#                #  Assign num_of_parts_of_test_data an appropriate value: each record of test dataset is supposed
#                # to constitute num_of_parts_of_test_data parts, and one of them would serve as the fact/answer part.
#                if 0 == random.randint(0, num_of_parts_of_test_data):
#                    test_real[k_user][1].append((m, r, t)) # the fact/answer part in one record of test dataset
#                else:
#                    test_real[k_user][0].append((m, r, t)) # the input part in one record of test dataset
#    else: 
#        # sort by time: PART 2 <start>
#        print 'sort by time PART 2'
    for k_user in test:
        #print 'len(test[k_user]):', len(test[k_user])
        #print 'num_of_parts_of_test_data:', num_of_parts_of_test_data
        #raw_input()
        #print (len(test[k_user]) * (1.0 / num_of_parts_of_test_data))
        #print (int)(len(test[k_user]) * (1.0 / num_of_parts_of_test_data))
        split_point_index = -1 * ((int)(len(test[k_user]) * test_data_inner_ratio))
        #print 'split_point_index:', split_point_index
        test_real[k_user] = [test[k_user][:split_point_index], test[k_user][split_point_index:]]
        #raw_input()

        # sort by time: PART 2 <end>

    #print test_real[test_real.keys()[0]]
    print 'sort_by_time:', sort_by_time

    #raw_input('pause')

    return train, test_real


In [138]:
data_filename, delimiter, data_set = os.path.sep.join(['ml-1m', 'ratings.dat']), '::', '1M'
#data_filename, delimiter = os.path.sep.join(['ml-10M100K', 'ratings.dat']), '::'
#data_filename, delimiter, data_set = os.path.sep.join(['ml-100k', 'u.data']), '\t', '100K'

seed = 2 
K = 10
train_percent = 0.8
test_data_inner_ratio = 0.8
train, test = extract_data_from_file_and_generate_train_and_test(data_filename, train_percent, seed, delimiter, test_data_inner_ratio)
#train, test = extract_data_from_file_and_generate_train_and_test(data_filename, 3, 0, seed, delimiter)


sort_by_time: False


In [18]:
train[train.keys()[0]]

[('593', 5.0, 966034312),
 ('1466', 3.0, 966034312),
 ('2289', 4.0, 965935347),
 ('1639', 4.0, 966034312),
 ('1649', 3.0, 966034407),
 ('1653', 1.0, 965934836),
 ('3260', 4.0, 966034245),
 ('223', 4.0, 965934892),
 ('3299', 2.0, 965935025),
 ('246', 5.0, 966034365),
 ('3618', 3.0, 965935175),
 ('2858', 5.0, 965934965),
 ('608', 4.0, 966034214),
 ('1179', 3.0, 966034271),
 ('1191', 4.0, 966034459),
 ('1197', 4.0, 965934862),
 ('1361', 5.0, 966034386),
 ('1396', 3.0, 965934862),
 ('3160', 5.0, 966034271),
 ('3176', 4.0, 965934965),
 ('1719', 5.0, 966034312),
 ('3186', 4.0, 965934982),
 ('2599', 4.0, 965935330),
 ('25', 5.0, 966034244),
 ('36', 4.0, 966034312),
 ('1221', 3.0, 965934836),
 ('2028', 4.0, 965934892),
 ('3783', 4.0, 965935142)]

In [26]:
print test.keys()[0]
test[test.keys()[0]]

1890


[[('1210', 4.0, 974691999),
  ('527', 4.0, 974692117),
  ('587', 3.0, 974692209),
  ('1', 4.0, 974692209),
  ('3813', 2.0, 974692209),
  ('1193', 4.0, 974692209),
  ('3893', 4.0, 974692355),
  ('3952', 5.0, 974692538),
  ('3943', 4.0, 974692627),
  ('1213', 5.0, 974692781),
  ('608', 5.0, 974693077),
  ('994', 4.0, 974693119),
  ('1233', 4.0, 974693165),
  ('2501', 3.0, 974693779),
  ('318', 4.0, 974693779),
  ('1635', 3.0, 974693827),
  ('1719', 3.0, 974693827),
  ('17', 3.0, 974693917),
  ('1041', 4.0, 974693917),
  ('1465', 3.0, 974694043),
  ('3418', 4.0, 974694043),
  ('41', 4.0, 974694043),
  ('2289', 4.0, 974694656),
  ('300', 4.0, 974694656),
  ('1179', 3.0, 974694707),
  ('2890', 4.0, 974694772),
  ('3163', 3.0, 974694772),
  ('314', 4.0, 974694772),
  ('800', 5.0, 974694814),
  ('1537', 3.0, 974695341),
  ('2396', 4.0, 974695341),
  ('1265', 4.0, 974695414),
  ('232', 4.0, 974695414),
  ('3114', 4.0, 974695414),
  ('2359', 4.0, 974695414),
  ('34', 1.0, 974695414),
  ('1188',

In [91]:
print len(test[test.keys()[0]])

2


In [78]:
train_file_name = 'tmp__svd_train'
test_file_name = 'tmp__svd_test'
my_sep = ':'

with open(train_file_name , 'w') as f:
     # format: userId, movieId, rating, timestamp
    for userId in train:
        for movieId, rating, timestamp in train[userId]:
            f.write(my_sep.join(map(str, [userId, movieId, rating, timestamp])) + '\n')
    for userId in test:
        for movieId, rating, timestamp in test[userId][0]:
            f.write(my_sep.join(map(str, [userId, movieId, rating, timestamp])) + '\n')

with open(test_file_name, 'w') as f:
    for userId in test:
        for movieId, rating, timestamp in test[userId][1]:
            f.write(my_sep.join(map(str, [userId, movieId, rating, timestamp])) + '\n')

In [31]:
help(Reader)

Help on class Reader in module surprise.dataset:

class Reader
 |  The Reader class is used to parse a file containing ratings.
 |  
 |  Such a file is assumed to specify only one rating per line, and each line
 |  needs to respect the following structure: ::
 |  
 |      user ; item ; rating ; [timestamp]
 |  
 |  where the order of the fields and the seperator (here ';') may be
 |  arbitrarily defined (see below).  brackets indicate that the timestamp
 |  field is optional.
 |  
 |  
 |  Args:
 |      name(:obj:`string`, optional): If specified, a Reader for one of the
 |          built-in datasets is returned and any other parameter is ignored.
 |          Accepted values are 'ml-100k', 'ml-1m', and 'jester'. Default
 |          is ``None``.
 |      line_format(:obj:`string`): The fields names, in the order at which
 |          they are encountered on a line. Example: ``'item user rating'``.
 |      sep(char): the separator between fields. Example : ``';'``.
 |      rating_scale(:ob

In [46]:
print train_file_name

tmp__svd_train


In [70]:
print train_file_name
print test_file_name

tmp__svd_train
tmp__svd_test


In [79]:
reader = Reader(line_format='user item rating timestamp', sep=my_sep)

data = Dataset.load_from_folds([(train_file_name, test_file_name)], reader=reader)

In [65]:
data

<surprise.dataset.DatasetUserFolds instance at 0x98f7c34c>

In [66]:
dir(data)

['__doc__',
 '__init__',
 '__module__',
 'construct_testset',
 'construct_trainset',
 'folds',
 'folds_files',
 'load_builtin',
 'load_from_file',
 'load_from_folds',
 'raw_folds',
 'read_ratings',
 'reader']

In [69]:
help(data.folds)

Help on method folds in module surprise.dataset:

folds(self) method of surprise.dataset.DatasetUserFolds instance
    Generator function to iterate over the folds of the Dataset.
    
    See :ref:`User Guide <iterate_over_folds>` for usage.
    
    Yields:
        tuple: :class:`Trainset` and testset of current fold.



In [80]:

for trainset, testset in data.folds():
    first_train, first_test = trainset, testset

In [82]:
print 'info of train:'
print first_train.n_users
print first_train.n_items
print first_train.n_ratings

print 'info of test:'
print len(first_test)

info of train:
6040
3698
840387
info of test:
159821


In [125]:
starttime = datetime.datetime.now()

###
# We'll use the famous SVD algorithm.
algo = SVD()

# train algorithm.
algo.train(first_train)

###
endtime = datetime.datetime.now()
interval = (endtime - starttime).seconds
print 'svd time consumption: %d' % (interval)


svd time consumption: 40


In [139]:
len(train)

4840

In [145]:
all_user_in_test = set([x[0] for x in first_test])
print len(all_user_in_test)


def extract_all_items(filename, delimiter):
    all_items = []
    with open(filename , 'r') as f:
        first_line = f.readline()
        for i, line in enumerate(f):
            userId, movieId, rating, timestamp = line.split(delimiter)
            
            all_items.append(movieId)
    return set(all_items)

all_item_in_test = None
all_items = extract_all_items(data_filename, delimiter)
print len(all_items)

1200
3706


In [165]:
#user_item_interaction = {}
#def extract_all_user_item_interaction(filename, delimiter):
#    ret = coll.defaultdict(set)
#    with open(filename , 'r') as f:
#        first_line = f.readline()
#        for i, line in enumerate(f):
#            userId, movieId, rating, timestamp = line.split(delimiter)
#            
#            ret[userId].add(movieId)
#    return ret
#
#user_item_interaction = extract_all_user_item_interaction(data_filename, delimiter)

user_item_interaction = None
user_item_interaction_in_history = {}#coll.defaultdict(dict)
for u in train:
    user_item_interaction_in_history[u] = set([x[0] for x in train[u]])
for u in test:
    user_item_interaction_in_history[u] = set([x[0] for x in test[u][0]])

In [180]:
user_item_interaction_in_history[user_item_interaction_in_history.keys()[0]]

{'1079',
 '110',
 '1197',
 '1198',
 '1213',
 '150',
 '1719',
 '2357',
 '265',
 '2858',
 '296',
 '318',
 '3246',
 '3552',
 '356',
 '608'}

In [185]:
for x in all_items:
    print type(x)
    break

<type 'str'>


In [166]:
print len(user_item_interaction_in_history)

6040


In [164]:
'3198' in user_item_interaction_in_history.keys()

False

In [198]:
#predictions = algo.test([('4', '13', -1)])
predictions_with_filtering = None
predictions_without_filtering = None

mid = [(uid, iid, -1) 
                         for iid in filter(lambda x: x not in user_item_interaction_in_history[uid], all_items)
                         for uid in all_user_in_test]
predictions = algo.test(mid)
#predictions_without_filtering = algo.test([(uid, iid, -1) 
#                         for iid in all_items
#                         for uid in all_user_in_test])

In [176]:
predictions = None

In [171]:
len([(uid, iid, -1) 
                         for iid in all_items
                         for uid in all_user_in_test])

4447200

In [168]:
len(predictions)

4422000

In [154]:
p = predictions[0]
print p

user: 4303       item: 1718       r_ui = -1.00   est = 3.72   {u'was_impossible': False}


In [156]:
p.uid

'4303'

In [87]:
dir(p)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__getslice__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_asdict',
 '_fields',
 '_make',
 '_replace',
 'count',
 'details',
 'est',
 'iid',
 'index',
 'r_ui',
 'uid']

In [88]:
first_test[0]

('1890', '2355', 4.0)

In [89]:
all_user_in_test = set([x[0] for x in first_test])
len(all_user_in_test)

1200

In [202]:
N = 20

import collections as coll
rec = coll.defaultdict(list)

for p in predictions:
    uid, iid, est = p.uid, p.iid, p.est
    #print uid, iid, est
    
    rec[uid].append((iid, est))
    
for u in rec:
    rec[u].sort(key=lambda x: -1 * x[1])

for u in rec:
    #print rec[u]
    #break
    
    rec[u] = rec[u][:N]



In [151]:
import datetime
def calculate_metrics(test, rec):
    starttime = datetime.datetime.now()
    hit = 0

    all__for_recall = 0
    all__for_precision = 0
    for user in test.keys():
        history = test[user][0]
        answer = test[user][1]
        tu = [x[0] for x in answer]
        rank = rec[user] # self.recommend(history, N)
        #print 'rank:', rank
        for item, pui in rank:
            if item in tu:
                hit += 1
        all__for_recall += len(tu)
        all__for_precision += len(rank) #Note: In book RSP, the author used 'all += N'

    metric_recall = None
    metric_precision = None
    metric_f1 = None
    if 0 == all__for_recall:
        metric_recall = 0
    else:
        metric_recall = hit / (all__for_recall * 1.0)

    if 0 == all__for_precision:
        metric_precision = 0
    else:
        metric_precision = hit / (all__for_precision * 1.0)

    if 0 == all__for_recall or 0 == all__for_precision:
        metric_f1 = 0
    else:
        metric_f1 = 2/(1./metric_precision + 1./metric_recall)

    endtime = datetime.datetime.now()
    interval = (endtime - starttime).seconds
    print 'metric calculation: time consumption: %d' % (interval)
    return {'recall': metric_recall, 'precision': metric_precision, 'f1': metric_f1}

In [179]:
calculate_metrics(test, rec)

metric calculation: time consumption: 0


{'f1': 0.019725711425789216,
 'precision': 0.07554166666666666,
 'recall': 0.011343941034031822}

In [203]:
# after removing already interacted items
calculate_metrics(test, rec)

metric calculation: time consumption: 0


{'f1': 0.018148089717714515,
 'precision': 0.0695,
 'recall': 0.010436676031310028}

In [188]:
for u in all_user_in_test:
    print u
    p_for_1890 = filter(lambda x: x.uid == u, predictions)
    #print len(p_for_1890)

    p_for_1890.sort(key=lambda x: -1 * x.est)
    p_for_1890 = p_for_1890[:N]
    set_p = set([x.iid for x in p_for_1890])
    #print set_p

    set_history = set([x[0] for x in test[u][0]])
    #print set_fact

    print len(set_p.intersection(set_history))
    break

4303
3


In [199]:
print len([(uid, iid, -1) 
                         for iid in all_items
                         for uid in ['4303']])
print len([(uid, iid, -1) 
                         for iid in filter(lambda x: x not in user_item_interaction_in_history[uid], all_items)
                         for uid in ['4303']])

3706
3631


In [200]:
len(filter(lambda x: x[0] == '4303', predictions))

3631

In [192]:
user_item_interaction_in_history['4303']

{'1084',
 '1090',
 '1094',
 '1104',
 '111',
 '1193',
 '1196',
 '1203',
 '1207',
 '1210',
 '1213',
 '1221',
 '1225',
 '1228',
 '1244',
 '1247',
 '1250',
 '1263',
 '1272',
 '1276',
 '1277',
 '1299',
 '1411',
 '1594',
 '1735',
 '1916',
 '1944',
 '1946',
 '1950',
 '1952',
 '1956',
 '2020',
 '2028',
 '2345',
 '2397',
 '25',
 '2718',
 '2804',
 '2858',
 '3006',
 '3019',
 '3038',
 '3067',
 '3090',
 '3095',
 '3111',
 '3152',
 '318',
 '3196',
 '3198',
 '32',
 '3363',
 '34',
 '3418',
 '3467',
 '3468',
 '3498',
 '3543',
 '356',
 '3730',
 '3735',
 '3741',
 '3765',
 '3811',
 '425',
 '515',
 '593',
 '608',
 '621',
 '908',
 '919',
 '920',
 '924',
 '949',
 '953'}

In [191]:
test['4303'][0]

[('1210', 4.0, 965265558),
 ('1104', 5.0, 965265665),
 ('356', 4.0, 965265665),
 ('1735', 4.0, 965265665),
 ('2397', 2.0, 965265665),
 ('3765', 3.0, 965265806),
 ('3111', 5.0, 965265822),
 ('3095', 5.0, 965265865),
 ('3730', 3.0, 965265865),
 ('3741', 4.0, 965265865),
 ('1411', 3.0, 965265908),
 ('2804', 5.0, 965265908),
 ('1946', 4.0, 965265908),
 ('425', 4.0, 965265908),
 ('919', 5.0, 965265939),
 ('920', 5.0, 965265939),
 ('3468', 5.0, 965265939),
 ('2858', 5.0, 965265939),
 ('1952', 5.0, 965265939),
 ('318', 4.0, 965265961),
 ('1207', 5.0, 965265961),
 ('1244', 4.0, 965265961),
 ('1272', 4.0, 965265961),
 ('593', 5.0, 965265984),
 ('3038', 5.0, 965265984),
 ('908', 4.0, 965265984),
 ('608', 4.0, 965265984),
 ('1193', 5.0, 965266006),
 ('1956', 5.0, 965266006),
 ('1221', 5.0, 965266006),
 ('3006', 4.0, 965266025),
 ('1944', 4.0, 965266025),
 ('1225', 4.0, 965266025),
 ('1276', 4.0, 965266025),
 ('3467', 4.0, 965266043),
 ('3196', 5.0, 965266043),
 ('1594', 4.0, 965266043),
 ('1084',

In [104]:
len(predictions)

159821

In [110]:
predictions[200].uid == '1890'

False

In [124]:
for u in all_user_in_test:
    p_for_1890 = filter(lambda x: x.uid == u, predictions)
    #print len(p_for_1890)

    p_for_1890.sort(key=lambda x: -1 * x.est)
    p_for_1890 = p_for_1890[:N]
    set_p = set([x.iid for x in p_for_1890])
    #print set_p

    set_fact = set([x[0] for x in test[u][1]])
    #print set_fact

    print set_p - set_fact

set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])
set([])


In [129]:
filter(lambda u: len(set([x[0] for x in test[u][1]])) < 20, all_user_in_test)


['4943',
 '1993',
 '2311',
 '1664',
 '197',
 '276',
 '2984',
 '2381',
 '1336',
 '5772',
 '3969',
 '2613',
 '3254',
 '5116',
 '1907',
 '2686',
 '98',
 '4178',
 '4073',
 '1310',
 '1174',
 '4254',
 '2530',
 '21',
 '3939',
 '1452',
 '2527',
 '790',
 '5314',
 '2834',
 '4547',
 '2884',
 '653',
 '2111',
 '4159',
 '4008',
 '3236',
 '1967',
 '1612',
 '2339',
 '4752',
 '2268',
 '4880',
 '4881',
 '4228',
 '5392',
 '5391',
 '311',
 '3275',
 '3273',
 '4211',
 '833',
 '3570',
 '1986',
 '4703',
 '5876',
 '4492',
 '2584',
 '5295',
 '5725',
 '3225',
 '3222',
 '1783',
 '5192',
 '254',
 '4',
 '1674',
 '2037',
 '1892',
 '6012',
 '5027',
 '665',
 '5122',
 '5159',
 '2920',
 '4749',
 '2816',
 '5606',
 '4944',
 '600',
 '158',
 '5168',
 '4991',
 '5735',
 '4393',
 '5258']

In [130]:
test['4'][1]

[('260', 5.0, 978294199),
 ('1196', 2.0, 978294199),
 ('1198', 5.0, 978294199),
 ('1387', 5.0, 978294199),
 ('2028', 5.0, 978294230),
 ('2366', 4.0, 978294230),
 ('1201', 5.0, 978294230),
 ('2692', 5.0, 978294230),
 ('2947', 5.0, 978294230),
 ('1214', 4.0, 978294260),
 ('3418', 4.0, 978294260),
 ('3702', 4.0, 978294260),
 ('1240', 5.0, 978294260),
 ('2951', 4.0, 978294282),
 ('1036', 4.0, 978294282),
 ('1954', 5.0, 978294282)]

In [132]:
r = filter(lambda x: x[0] == '4', predictions)
len(r)

16

In [52]:
#from surprise import BaselineOnly
from surprise import Dataset
#from surprise import evaluate
from surprise import Reader

# path to dataset file
#file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
file_path = train_file_name

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format='user item rating timestamp', sep=':')

data = Dataset.load_from_file(file_path, reader=reader)
my_train = data.build_full_trainset()

###
file_path = test_file_name
reader = Reader(line_format='user item rating timestamp', sep=':')
data = Dataset.load_from_file(file_path, reader=reader)
my_test = data.build_full_trainset()

In [35]:
print dir(my_test)

['__doc__', '__init__', '__module__', '_global_mean', '_raw2inner_id_items', '_raw2inner_id_users', 'all_items', 'all_ratings', 'all_users', 'global_mean', 'ir', 'knows_item', 'knows_user', 'n_items', 'n_ratings', 'n_users', 'offset', 'rating_scale', 'to_inner_iid', 'to_inner_uid', 'ur']


In [45]:
print my_train.n_items
print my_test.n_items

1
1


In [None]:

# We'll use the famous SVD algorithm.
algo = SVD()


    
# train and test algorithm.
algo.train(my_train)
predictions = algo.test(my_test)

In [2]:
# from https://github.com/NicolasHug/Surprise/blob/master/examples/load_custom_dataset.py

import os

#from surprise import BaselineOnly
from surprise import Dataset
#from surprise import evaluate
from surprise import Reader

# path to dataset file
#file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
file_path = '/home/wsyj/dissertation__recommendation_system__experiment_2/' + 
            'dissertation__recommendation_system__experiment/ml-1m/ratings.dat'

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(line_format='user item rating timestamp', sep='::')

data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=5)

## We'll use an algorithm that predicts baseline estimates.
#algo = BaselineOnly()
#
## Evaluate performances of our algorithm on the dataset.
#evaluate(algo, data)

In [8]:

# We'll use the famous SVD algorithm.
algo = SVD()

first_train = None
first_test = None
for trainset, testset in data.folds():
    first_train, first_test = trainset, testset
    
# train and test algorithm.
algo.train(first_train)
predictions = algo.test(first_test)


In [9]:
trainset

<surprise.dataset.Trainset instance at 0xb68b15ec>

In [10]:
data

<surprise.dataset.DatasetAutoFolds instance at 0xb1fadaac>

In [4]:
predictions[0]

Prediction(uid='2340', iid='1344', r_ui=4.0, est=4.6037187047460844, details={u'was_impossible': False})

In [5]:
first_test[0]

('2340', '1344', 4.0)

In [6]:
for i in first_test:
    print i

('2340', '1344', 4.0)
('3770', '1234', 5.0)
('1441', '539', 4.0)
('5401', '345', 5.0)
('3562', '1968', 4.0)
('3752', '2081', 5.0)
('3380', '2303', 4.0)
('1753', '3256', 3.0)
('5831', '2349', 4.0)
('2996', '69', 2.0)
('5092', '1035', 4.0)
('2681', '410', 3.0)
('1422', '3873', 3.0)
('4277', '3514', 4.0)
('1916', '1294', 5.0)
('5472', '2763', 1.0)
('2765', '2490', 3.0)
('1984', '3671', 4.0)
('4049', '353', 3.0)
('746', '1799', 2.0)
('2407', '1230', 4.0)
('3724', '1580', 4.0)
('4451', '2952', 4.0)
('3143', '17', 4.0)
('2484', '2599', 3.0)
('2302', '3740', 4.0)
('1255', '1660', 4.0)
('3483', '3683', 4.0)
('3434', '2324', 5.0)
('1392', '1219', 5.0)
('1274', '110', 3.0)
('651', '3698', 2.0)
('2176', '260', 5.0)
('5005', '1292', 3.0)
('3134', '1274', 4.0)
('10', '2291', 4.0)
('2091', '1304', 4.0)
('4416', '3360', 5.0)
('5919', '2428', 4.0)
('1707', '1094', 5.0)
('2958', '3159', 4.0)
('1722', '1341', 4.0)
('4450', '2641', 1.0)
('5689', '110', 5.0)
('4821', '349', 3.0)
('3521', '898', 4.0)
('320

In [7]:
dir(data)


['__doc__',
 '__init__',
 '__module__',
 'build_full_trainset',
 'construct_testset',
 'construct_trainset',
 'folds',
 'load_builtin',
 'load_from_file',
 'load_from_folds',
 'n_folds',
 'ratings_file',
 'raw_folds',
 'raw_ratings',
 'read_ratings',
 'reader',
 'shuffle',
 'split']

In [8]:
print data

<surprise.dataset.DatasetAutoFolds instance at 0xa93b6e0c>
