# Данные

In [2]:
import csv
from collections import defaultdict, Counter
import random
import json
from metrics import apk
random.seed(42)

In [3]:
from sklearn.ensemble import *
import numpy as np

In [4]:
import scipy as sp
import scipy.sparse
import scipy.sparse.linalg

In [5]:
import lasagne
import theano
import theano.tensor as T

In [6]:
#Словари для основной выборки
user_to_items = defaultdict(set)
item_to_users = defaultdict(set)
edges = []
#Словари для тестовой выборки
test_user_to_items = defaultdict(set)
test_item_to_users = defaultdict(set)
test_edges = []

with open("data/train_likes.csv") as datafile:
    for like in csv.DictReader(datafile):
        # Кидаем монетку. В зависимости от результата кладём в обучение или тест
        if random.random() < 0.90:
            user_to_items[like['user_id']].add(like['item_id'])
            item_to_users[like['item_id']].add(like['user_id'])
            edges.append((like['user_id'], like['item_id']))
        else:
            test_user_to_items[like['user_id']].add(like['item_id'])
            test_item_to_users[like['item_id']].add(like['user_id'])
            test_edges.append((like['user_id'], like['item_id']))

In [7]:
all_items = set(item_to_users.keys()) | set(test_item_to_users.keys())
all_users = set(user_to_items.keys()) | set(test_user_to_items.keys())

In [8]:
users_list = list(user_to_items.keys())
test_users_list = list(test_user_to_items.keys())

In [9]:
user_to_i = {user: i for i, user in enumerate(all_users)}
item_to_i = {item: i for i, item in enumerate(all_items)}
all_users_list = list(all_users)
all_items_list = list(all_items)

In [10]:
matrix = sp.sparse.lil_matrix((len(all_users), len(all_items)))
test_matrix = sp.sparse.lil_matrix((len(all_users), len(all_items)))
for user, items in user_to_items.items():
    for item in items:
        matrix[user_to_i[user], item_to_i[item]] = True
for user, items in test_user_to_items.items():
    for item in items:
        test_matrix[user_to_i[user], item_to_i[item]] = True
matrix = matrix.tocsr()
test_matrix = test_matrix.tocsr()

In [11]:
matrix

<55863x23891 sparse matrix of type '<class 'numpy.float64'>'
	with 99582 stored elements in Compressed Sparse Row format>

In [12]:
u, s, vt = sp.sparse.linalg.svds(matrix.astype(np.float32), k=100)

In [13]:
u.shape, s.shape, vt.shape

((55863, 100), (100,), (100, 23891))

In [14]:
films = json.load(open('data/items.json'))
films = {a['id']:a for a in films}
for a in films.values():
    del a['id']

In [15]:
for f in films.values():
    if 'genre' in f:
        f[f['genre']] = 1
        del f['genre']

In [16]:
features = Counter()
for f in films.values():
    features.update(set(f.keys()))

In [17]:
features.most_common(20)

[('year', 138078),
 ('duration', 138078),
 (0, 34128),
 (3, 32154),
 ('f_114306', 30650),
 (1, 28635),
 ('f_79251', 16647),
 ('f_117573', 15295),
 ('f_138980', 15215),
 (5, 15008),
 ('f_64513', 14020),
 ('f_205162', 10693),
 ('f_122038', 10436),
 ('f_84602', 8377),
 ('f_68894', 8348),
 ('f_127793', 7946),
 ('f_72071', 7866),
 (4, 7386),
 ('f_210900', 7085),
 ('f_130202', 6371)]

In [18]:
small_features_size = 20 

In [19]:
small_features = set([_[0] for _ in features.most_common(small_features_size)])

In [20]:
feature_to_i = {feature: i for i, feature in enumerate(small_features)}

In [21]:
feature_to_i

{0: 0,
 1: 1,
 3: 3,
 'f_68894': 6,
 5: 5,
 'f_84602': 2,
 'f_79251': 9,
 'f_114306': 10,
 'f_117573': 7,
 'year': 13,
 'f_210900': 14,
 'f_205162': 19,
 'f_138980': 4,
 'f_64513': 11,
 'f_72071': 15,
 4: 8,
 'f_122038': 16,
 'f_127793': 17,
 'duration': 18,
 'f_130202': 12}

# Фильтрация пользователей
* Значительная часть пользователей имеет всего 1-2 просмотра. При всём желании, рекоммендовать им что-либо осмысленное при помощи рассматриваемого здесь метода мы вряд ли сможем. Для простоты вычислений, удалим их из выборки.
* Важно понимать, что качество на оставшихся пользователях скорее всего будет выше, чем на первоначальной выборке.

In [22]:
min_items_per_user = 2
from copy import copy
for user in copy(test_user_to_items).keys():
    
    n_items_per_user = len(user_to_items[user]) + len(test_user_to_items[user])
    
    if n_items_per_user <= min_items_per_user:
        del user_to_items[user]
        del test_user_to_items[user]

### Рекоммендующая функция
Позволим себе немного вольности: наша функция будет возвращать не вероятности, а список фильмов в порядке убывания "рекомендованности".

* Рекоммендованность фильма item пользователю user посчитаем так:
  * Для каждого фильма, полайканного пользователем user, найдём других людей, которым понравился фильм.
  * Сложим всех таких "друзей по лайкам" вместе и назовём соседями (__neighborhood__) пользователя.
  * Для фильма item узнаем его аудиторию - множество пользователей, которые его лайкнули
  * Пригодность фильма пользователю - то, насколько "друзьям по лайкам" пользователя нравится этот фильм.

Для примера, будем использовать косинусную меру расстояния
  
$ cos(u_{film}, u_{neighborhood}) = $ =$ u_{film} \cdot u_{neighborhood} \over |u_{film}| |u_{neighborhood}| $


$u_{neighborhood}$ зависит только от пользователя, но не от фильма, поэтому при сравнении фильмов по пригодности для одного пользователя, его можно исключить из формулы для простоты вычислений.

$ similarity(u_{film}, u_{neighborhood}) = $ $  u_{film} \cdot u_{neighborhood} \over |u_{film}| $
  
  
Распишем формулу подробно:

$ similarity(u_{film}, u_{neighborhood}) = $ $ \sum _{u_i} [u_i \in u_{film}] \cdot [u_i \in u_{neighborhood}] \over |u_{film}|  $

* u_i - очередной пользователь (в цикле по всем пользователям)
  
Выражение $[u_i \in u_{neighborhood}]$ здесь означает "сколько раз очередной пользователь входит в множество друзей по лайкам"
  
  

In [23]:
from math import sqrt
from collections import Counter

def recommend(user, n_best = 10):
    user_items = user_to_items[user]
    
    neighborhood = Counter()
    for item in user_items:
        neighborhood.update(item_to_users[item])
    
    #словарь {фильм -> пригодность фильма пользователю}
    item_similarities = {}
    
    for item in all_items:
        if item in user_items: continue
        item_users = item_to_users[item]
        if len(item_users) == 0: continue
        
        n_common_users = sum(neighborhood[user] for user in item_users)
        similarity = float(n_common_users) / sqrt(len(item_users))
        item_similarities[item] = similarity
    
    items_sorted = sorted(all_items, key = lambda x: item_similarities.get(x, 0),reverse = True)
    
    return items_sorted[:n_best]

In [24]:
user_to_int = dict()
for i, user in enumerate(all_users):
    user_to_int[user] = i 

In [25]:
#Making dataset
user_features = defaultdict(lambda:defaultdict(lambda:0))
for user in list(user_to_items.keys())[0:]:
    for item in user_to_items[user]:
        if item in films:
            for feature, value in films[item].items():    
                user_features[user][feature]+=value
    for f in user_features[user]:
        user_features[user][f]/=len(user_to_items)

In [26]:
edges[0] in test_edges

False

In [27]:
features_size = 2*small_features_size + 1

In [28]:
def extract_features(user=None, item=None, X_sample = np.zeros(features_size, dtype='float32')):
    if user is not None:
        for f in user_features[user].keys()&small_features:
            X_sample[feature_to_i[f]] = user_features[user][f]
        if item is not None:
            X_sample[2*small_features_size+0] = user_film_mk2(user, item)
    if item is not None and item in films:
        for f in films[item].keys()&small_features:
            X_sample[small_features_size+feature_to_i[f]] = films[item][f]
    
    return X_sample

In [29]:
len(edges)

100068

In [38]:
def generate_random_samples(X_size = 100, use_test=False):
    X = np.zeros((X_size, features_size), dtype='float32')
    Y = np.zeros(X_size, dtype='int8')
    if use_test:
        local_users_list = test_users_list
        local_user_to_items = test_user_to_items
    else:
        local_users_list = users_list
        local_user_to_items = user_to_items
    for i in range(X_size):
        film =''
        while film not in films: 
            result = random.random()
            if result > 0.50:
                result = 1
            else:
                result = 0
            user = random.choice(local_users_list)
            if result==0:
                film = random.choice(all_items_list)
                if film in local_user_to_items[user]:
                    result = 1
            else:
                film = random.choice(list(local_user_to_items[user]|{''}))
        X[i] = extract_features(user, film)
        Y[i] = result
    return X, Y

In [86]:
def generate_samples(X_true = 100,  use_test=False):
    X_fake=X_true
    X = np.zeros((X_true+X_fake, features_size), dtype='float32')
    Y = np.zeros(X_true+X_fake, dtype='float32')
    if use_test:
        local_users_list = test_users_list
        local_user_to_items = test_user_to_items
        local_edges = test_edges 
    else: 
        local_users_list = users_list
        local_user_to_items = user_to_items
        local_edges = edges
    for i, (user, film)  in enumerate(local_edges[:X_true]):
        X[i] = extract_features(user, film)
        Y[i] = 1
    for i in range(X_true, X_true+X_fake):
        user = random.choice(local_users_list)
        film = random.choice(all_items_list)
        X[i] = extract_features(user, film)
        if film in local_user_to_items[user]:
            Y[i] = 1
    return X, Y

In [49]:
rf = RandomForestRegressor(n_estimators=500)

In [50]:
X, Y = generate_random_samples(100000)

In [51]:
rf.fit(X, Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [58]:
X, Y = generate_random_samples(100, use_test=False)
print("Train score: %s" %(rf.score(X, Y),))
X, Y = generate_random_samples(100, use_test=True)
print("Test score: %s" %(rf.score(X, Y),))

Train score: 0.878675890313
Test score: 0.202766900421


In [62]:
input_X = T.matrix("input X")
input_shape = [None, features_size]

target_y = T.matrix("target Y integer",dtype='float32')

In [63]:
#Input layer (auxilary)
layer = lasagne.layers.InputLayer(shape = input_shape,input_var=input_X)

#fully connected layer, that takes input layer and applies 50 neurons to it.
# nonlinearity here is sigmoid as in logistic regression
# you can give a name to each layer (optional)
layer = lasagne.layers.DenseLayer(layer,num_units=50,
                                   nonlinearity = lasagne.nonlinearities.tanh,
                                   name = "hidden_dense_layer")

#fully connected output layer that takes dense_1 as input and has 10 neurons (1 for each digit)
#We use softmax nonlinearity to make probabilities add up to 1
dense_output = lasagne.layers.DenseLayer(layer,num_units = 1 ,
                                        nonlinearity = lasagne.nonlinearities.rectify,
                                        name='output', b=None)

In [76]:
#network prediction (theano-transformation)
y_predicted = lasagne.layers.get_output(dense_output)
#all network weights (shared variables)
all_weights = lasagne.layers.get_all_params(dense_output, trainable=True)
print (all_weights)

[hidden_dense_layer.W, hidden_dense_layer.b, output.W]


In [83]:
loss = lasagne.objectives.squared_error(y_predicted, target_y)
loss = lasagne.objectives.aggregate(loss, mode = 'mean')

accuracy = lasagne.objectives.squared_error(y_predicted, target_y)
accuracy = lasagne.objectives.aggregate(accuracy, mode = 'mean')


#This function computes gradient AND composes weight updates just like you did earlier
updates_sgd = lasagne.updates.sgd(loss, all_weights,learning_rate=0.1)

#function that computes loss and updates weights
train_fun = theano.function([input_X,target_y],[loss,accuracy],updates= updates_sgd)

#function that just computes accuracy
accuracy_fun = theano.function([input_X,target_y],accuracy)

In [87]:
import time

num_epochs = 1000 #amount of passes through the data

batch_size = 1000 #number of samples processed at each function call

for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_acc = 0
    train_batches = 0
    start_time = time.time()
    for batch in (generate_samples(batch_size, use_test=False) for _ in range(100)):
        inputs, targets = batch
        targets = targets.reshape((-1, 1))
        train_err_batch, train_acc_batch= train_fun(inputs, targets)
        train_err += train_err_batch
        train_acc += train_acc_batch
        train_batches += 1

    # And a full pass over the validation data:
    val_acc = 0
    val_batches = 0
    for batch in (generate_samples(batch_size, use_test=True) for _ in range(100)):
        inputs, targets = batch
        targets = targets.reshape((-1, 1))
        val_acc += accuracy_fun(inputs, targets)
        val_batches += 1

    
    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))

    print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))
    print("  train accuracy:\t\t{} ".format(
        train_acc / train_batches ))
    print("  validation accuracy:\t\t{} ".format(
        val_acc / val_batches))

Epoch 1 of 1000 took 12.404s
  training loss (in-iteration):		0.500035
  train accuracy:		0.500035 
  validation accuracy:		0.5000150000000001 
Epoch 2 of 1000 took 12.282s
  training loss (in-iteration):		0.500040
  train accuracy:		0.50004 
  validation accuracy:		0.5000100000000001 
Epoch 3 of 1000 took 12.088s
  training loss (in-iteration):		0.500055
  train accuracy:		0.5000550000000001 
  validation accuracy:		0.5000100000000001 
Epoch 4 of 1000 took 12.794s
  training loss (in-iteration):		0.500040
  train accuracy:		0.50004 
  validation accuracy:		0.5 
Epoch 5 of 1000 took 13.136s
  training loss (in-iteration):		0.500020
  train accuracy:		0.5000200000000001 
  validation accuracy:		0.50001 


KeyboardInterrupt: 

In [89]:
X, Y = generate_random_samples(10, use_test=False)


(array([[ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.]]), array([0, 0, 1, 1, 0, 1, 1, 1, 0, 0], dtype=int8))

odict_items([(hidden_dense_layer.W, Elemwise{add,no_inplace}.0), (hidden_dense_layer.b, Elemwise{add,no_inplace}.0), (output.W, Elemwise{add,no_inplace}.0), (<TensorType(float64, matrix)>, Elemwise{sub,no_inplace}.0), (<TensorType(float64, vector)>, Elemwise{sub,no_inplace}.0), (<TensorType(float64, matrix)>, Elemwise{sub,no_inplace}.0)])

In [332]:
X, Y = generate_random_samples(10, use_test=True)
X[0], rf.predict(X), Y

(array([  3.98906996e-05,   0.00000000e+00,   5.98360493e-05,
          0.00000000e+00,   0.00000000e+00,   3.98906996e-05,
          0.00000000e+00,   7.97813991e-05,   1.99453498e-05,
          9.97267489e-05,   3.98906996e-05,   3.98906996e-05,
         -3.40250190e-05,   1.99453498e-05,   1.99453498e-05,
          8.33385002e-06,   0.00000000e+00,   0.00000000e+00,
          1.99453498e-05,   1.99453498e-05,   1.00000000e+00,
          1.00000000e+00,   2.00000000e+00,   1.00000000e+00,
          1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
          2.00000000e+00,   1.00000000e+00,   1.00000000e+00,
          1.00000000e+00,   2.00000000e+00,  -6.53573573e-01,
          2.00000000e+00,   1.00000000e+00,   4.65432197e-01,
          1.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          1.00000000e+00,   1.09823048e-03,   0.00000000e+00], dtype=float32),
 array([ 0.74074074,  0.09090909,  0.10869565,  0.06      ,  0.12727273,
         0.12962963,  0.11111111,  0.12121

In [34]:
def user_film_mk3(user, item, debug=False):
    X = np.zeros((1, features_size), dtype='float32')
    X[0] = extract_features(user, item)
    probs = rf.predict(X)
    if debug:
        print(probs)
    return probs[0]

In [35]:
def user_film_mk1(user, item, debug=False):
    #print(len(user_features[user]))
    cursum = 0
    curcnt = 0
    if item in films:
        for feature, value in films[item].items():
            if type(value) is int:
                curcnt+=1    
                cursum+=value*user_features[user][feature]
    if curcnt==0:
        curcnt+=1
    #cursum/=curcnt
    if debug:
        print(cursum)
    return cursum

In [36]:
def user_film_mk2(user, item, debug=False):
    ui = user_to_i[user]
    ii = item_to_i[item]
    if debug:
        print(ui, ii)
    a = np.dot(u[ui,:] * s, vt[:,ii])
    return a

In [285]:
user_film_mk3('26052b20aa96ed8d803dbfe4e9497192', '45ea2aa2143effcb6575daf0143e31b4', debug=True)

[ 0.]


0.0

In [321]:
def recommend_mk3_manual(user, n_best = 10, debug=False):
    user_items = user_to_items[user]

    item_similarities = {}
    cur_items = [item for item in all_items if item not in user_items and len(item_to_users[item])>0]
    
    X = np.zeros((len(cur_items), features_size), dtype='float32')
    X_user = extract_features(user)
    for i, item in enumerate(cur_items):
        X[i] = extract_features(user=None, item=item, X_sample=X_user)
    
    probs =rf.predict(X)
    item_similarities = {key: prob for key, prob in zip(cur_items, probs)}
    items_sorted = sorted(all_items, key = lambda x: item_similarities.get(x, 0),reverse = True)
    if debug:
        print(X)
        for a in  items_sorted[:n_best]:
            print((a, item_similarities.get(a, 0)))
    return items_sorted[:n_best]

In [322]:
def generic_recommend(user_flim_function):
    def recommend(user, n_best = 10, debug=False):
        user_items = user_to_items[user]

        item_similarities = {}
        for item in all_items:
            if item in user_items: continue
            item_users = item_to_users[item]
            if len(item_users) == 0: continue

            item_similarities[item] = user_flim_function(user, item)

        items_sorted = sorted(all_items, key = lambda x: item_similarities.get(x, 0),reverse = True)
        if debug:
            for a in  items_sorted[:n_best]:
                print((a, item_similarities.get(a, 0)))
        return items_sorted[:n_best]
    return recommend

In [123]:
generic_recommend(user_film_mk2)('1a337111f63f6a7f86cebf6b2ad3d732', debug=True)
user_film_mk2('1a337111f63f6a7f86cebf6b2ad3d732', 'c745ed11b94ed93f3008897c75240de3', debug=True)

('7682d6641caf183ffcf2616c9bb1e434', 0.08739379)
('b5e424608d041f38c2fd6125b3bcd40f', 0.062321573)
('006a602774c79d476cdbd5dff7dd24f1', 0.047887065)
('e9caaa12a277abf87ba07b84e09a4170', 0.045560129)
('aa5f2ca699da42e467e550f9f071fb3f', 0.044249389)
('bcee3a68dbfb6b4ea97ac53ceb1d3287', 0.044186898)
('e8047b4262e676d28f50816ac3fde1ca', 0.042720947)
('77c7998b3d3d41f3fc3aec786b4015ac', 0.038098648)
('1da6f84598fbeea050da49fe37297932', 0.037306551)
('2dc12b93072f94bca9438902bddc36c4', 0.035170436)
10976 651


0.0024645075

In [124]:
generic_recommend(user_film_mk1)('1a337111f63f6a7f86cebf6b2ad3d732', debug=True)
user_film_mk1('1a337111f63f6a7f86cebf6b2ad3d732', 'c745ed11b94ed93f3008897c75240de3', debug=True)

('e13df82b3252eaeefc0c13599695f9c7', 0.000658196541476355)
('634e622e5c957c2cd8ddb98ca75ef937', 0.0005584697927678163)
('8faa6327a2ca2eb92e738ffc204d35c0', 0.0005584697927678163)
('64b6536649e20bd72d7183ec717b6c86', 0.0005385244430261086)
('5c6d1862dc1e82f340962b308999e603', 0.0005185790932844008)
('bcee3a68dbfb6b4ea97ac53ceb1d3287', 0.0005185790932844007)
('ef786a839c609561b76ce03768eace80', 0.0005185790932844007)
('ff987a60d68f5bbbefe97f8c5062711a', 0.0004986337435426931)
('753bce7ac492e6c95b07b85f8877cf5a', 0.0004786883938009853)
('7b0f1b0815292c0a550fabd5eeadc011', 0.00045874304405927767)
0


0

In [323]:
recommend_mk3_manual('a18f526db904f8f2ee4c8d178bfc818c', debug=True)

[[  1.99453498e-05   3.39070946e-04   3.98906996e-05 ...,   1.00000000e+00
    8.16810257e-07   0.00000000e+00]
 [  1.99453498e-05   3.39070946e-04   3.98906996e-05 ...,   1.00000000e+00
    8.16810257e-07   0.00000000e+00]
 [  1.99453498e-05   3.39070946e-04   3.98906996e-05 ...,   1.00000000e+00
    8.16810257e-07   0.00000000e+00]
 ..., 
 [  1.99453498e-05   3.39070946e-04   3.98906996e-05 ...,   1.00000000e+00
    8.16810257e-07   0.00000000e+00]
 [  1.99453498e-05   3.39070946e-04   3.98906996e-05 ...,   1.00000000e+00
    8.16810257e-07   0.00000000e+00]
 [  1.99453498e-05   3.39070946e-04   3.98906996e-05 ...,   1.00000000e+00
    8.16810257e-07   0.00000000e+00]]
('1df5a0fbce075d47107feeb8d65a138b', 0.27777777777777779)
('7db05f44445f307fd8157d7bc5e4db51', 0.27777777777777779)
('47c701ff962c505d385bfd251f061c42', 0.27777777777777779)
('68d0fcae54011b2f7d96f82ef4b0af2c', 0.27777777777777779)
('bd03a2df7386c06c1f7b3a3a8661ece4', 0.27777777777777779)
('4740836db6ce4b2144d81fd6a9a7

['1df5a0fbce075d47107feeb8d65a138b',
 '7db05f44445f307fd8157d7bc5e4db51',
 '47c701ff962c505d385bfd251f061c42',
 '68d0fcae54011b2f7d96f82ef4b0af2c',
 'bd03a2df7386c06c1f7b3a3a8661ece4',
 '4740836db6ce4b2144d81fd6a9a796c9',
 '4bef82fe1a89bc190e1d2b0e2c4ac8b3',
 '2e5a3f27c8799b2455fc99bd33fb396a',
 '49587ef465a58c65439506103faa210c',
 '85fca0881e606136eb7637b830850c84']

In [238]:
list(test_user_to_items.items())[7]

('a18f526db904f8f2ee4c8d178bfc818c', {'7a040062b953a9a3e7fb1545def5773f'})

In [42]:
def recommend_dummy(user, n_best = 10):
    item_similarities = []
    for item in all_items:
        #пропустим те фильмы, которые пользователь уже просмотрел, если нас об этом попросили
        if item in user_to_items[user]: continue
        item_similarities.append(item)
         
    random.shuffle(item_similarities)
    #вернём n_best наиболее пригодных
    #print(items_sorted[:n_best])
    return item_similarities[:n_best]

# Оценка качества - map@k

In [45]:
check_quality(recommend_dummy,10,200)

0 / 200
100 / 200
AP@10 = 9.259259259259259e-05


In [324]:
check_quality(recommend_mk3_manual,10, 200)

0 / 200
100 / 200
AP@10 = 0.0


In [148]:
check_quality(generic_recommend(user_film_mk1),200, 200)

0 / 200
100 / 200
AP@200 = 0.0020925098584587487


In [130]:
check_quality(generic_recommend(user_film_mk2),200,200)

0 / 200
100 / 200
AP@100 = 0.007631459343212682


In [132]:
check_quality(recommend,100, 200)

0 / 200
100 / 200
AP@100 = 0.003063415596733874


In [49]:
len(test_user_to_items)

2732

In [127]:
#сколько рекоммендаций рассматриваем
def check_quality(function, K = 10, max_n_users = len(test_user_to_items)):
    APatK_per_user = []
    user_list = list(test_user_to_items.keys())[:max_n_users]
    
    for i, user in enumerate(user_list):
        #фильмы, которые пользователю на самом деле нравятся
        test_items = test_user_to_items[user]

        #Выдать топ-K рекоммендаций
        recommendation_list = function(user,n_best=K)
        #Посчитать ap@k
        user_APatK = apk(test_items, recommendation_list,k=K)

        #и сложить в коробку
        APatK_per_user.append(user_APatK)

        #Progress bar
        if i % 100 ==0:
            print(i,'/',max_n_users)

        if i > max_n_users:
            break

    print('AP@{} = {}'.format(K, sum(APatK_per_user)/len(APatK_per_user)))


# Notes
* Кроме качества рекоммендаций, map@k ещё зависит от доли тестовой выборки, фильтрации и от самого K. Сравнивать качество разных алгоритмов имеет смысл только при одинаковом K и тестовой выборке.
* Давать полезные рекоммендации пользователям с малым числом просмотров тоже можно: например, можно выдавать наиболее популярные в целом фильмы.
* Разделение на обучение/тест честнее делать на по времени: первые 70% (например) лайков в обучение, остальные в тест. Это ближе к реальной жизни, когда вы сначала обучаете модель на логах, а потом применяете на новых сессиях пользователей.