Skip to content

Commit

Permalink
Cosine full vector implementation, as discussed in NicolasHug#135
Browse files Browse the repository at this point in the history
  • Loading branch information
ODemidenko committed Feb 5, 2018
1 parent fa85c0d commit df067c0
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 7 deletions.
3 changes: 3 additions & 0 deletions doc/source/prediction_algorithms.rst
Expand Up @@ -130,6 +130,9 @@ argument is a dictionary with the following (all optional) keys:
``'False'``) for the similarity not to be zero. Simply put, if
:math:`|I_{uv}| < \text{min_support}` then :math:`\text{sim}(u, v) = 0`. The
same goes for items.
- ``'common_ratings_only'``: Determines whether only common user/item ratings are
taken into account, or whether the full rating vectors are considered
(only relevant for cosine-based similarity). Default is ``True``.
- ``'shrinkage'``: Shrinkage parameter to apply (only relevant for
:func:`pearson_baseline <surprise.similarities.pearson_baseline>` similarity).
Default is 100.
Expand Down
4 changes: 4 additions & 0 deletions surprise/prediction_algorithms/algo_base.py
Expand Up @@ -285,6 +285,10 @@ def compute_similarities(self):
bx, by = bi, bu

args += [self.trainset.global_mean, bx, by, shrinkage]
elif name == 'cosine':
common_ratings_only = self.sim_options.get('common_ratings_only',
True)
args += [common_ratings_only]

try:
print('Computing the {0} similarity matrix...'.format(name))
Expand Down
101 changes: 98 additions & 3 deletions surprise/similarities.pyx
Expand Up @@ -25,11 +25,10 @@ from six.moves import range
from six import iteritems


def cosine(n_x, yr, min_support):
def cosine(n_x, yr, min_support, common_ratings_only=True):
"""Compute the cosine similarity between all pairs of users (or items).
Only **common** users (or items) are taken into account. The cosine
similarity is defined as:
The cosine similarity is defined as:
.. math::
\\text{cosine_sim}(u, v) = \\frac{
Expand All @@ -52,8 +51,20 @@ def cosine(n_x, yr, min_support):
For details on cosine similarity, see on `Wikipedia
<https://en.wikipedia.org/wiki/Cosine_similarity#Definition>`__.
Depending on the ``common_ratings_only`` field of ``sim_options``, either
only common users (or items) are taken into account, or the full rating
vectors are used (default: ``True``, i.e. common ratings only).
"""

if common_ratings_only:
return cosine_common_ratings_only(n_x, yr, min_support)
else:
return cosine_full_rating_vectors(n_x, yr, min_support)


def cosine_common_ratings_only(n_x, yr, min_support):

# sum (r_xy * r_x'y) for common ys
cdef np.ndarray[np.double_t, ndim=2] prods
# number of common ys
Expand All @@ -78,6 +89,90 @@ def cosine(n_x, yr, min_support):
for y, y_ratings in iteritems(yr):
for xi, ri in y_ratings:
for xj, rj in y_ratings:
freq[xi, xj] += 1
prods[xi, xj] += ri * rj
sqi[xi, xj] += ri ** 2
sqj[xi, xj] += rj ** 2

for xi in range(n_x):
sim[xi, xi] = 1
for xj in range(xi + 1, n_x):
if freq[xi, xj] < min_sprt:
sim[xi, xj] = 0
else:
denum = np.sqrt(sqi[xi, xj] * sqj[xi, xj])
sim[xi, xj] = prods[xi, xj] / denum

sim[xj, xi] = sim[xi, xj]

return sim


def cosine_full_rating_vectors(n_x, yr, min_support):

# sum (r_xy * r_x'y) for common ys
cdef np.ndarray[np.double_t, ndim=2] prods
# number of ys processed for the pair (every y counts: missing ratings are 0)
cdef np.ndarray[np.int_t, ndim=2] freq
# sum (r_xy ^ 2) over all ys (missing ratings contribute 0)
cdef np.ndarray[np.double_t, ndim=2] sqi
# sum (r_x'y ^ 2) for common ys
cdef np.ndarray[np.double_t, ndim=2] sqj
# the similarity matrix
cdef np.ndarray[np.double_t, ndim=2] sim

cdef int xi, xj
cdef double ri, rj
cdef int min_sprt = min_support

prods = np.zeros((n_x, n_x), np.double)
freq = np.zeros((n_x, n_x), np.int)
sqi = np.zeros((n_x, n_x), np.double)
sqj = np.zeros((n_x, n_x), np.double)
sim = np.zeros((n_x, n_x), np.double)

for y, y_ratings in iteritems(yr):

# The y_ratings data structure is sparse, but the full-vector cosine
# similarity needs every pair of xs, with 0 substituted for missing ratings.
# Implementation:
# Iterate through the whole range of x-indexes, using a rating of 0 for each
# index unless that index is actually present in the sorted ratings.
sorted_y_ratings = sorted(y_ratings, key=lambda x: x[0])
xi_iter = iter(sorted_y_ratings)
try:
xi_non_missing, ri_non_missing = next(xi_iter)
except StopIteration:
xi_non_missing = n_x
for xi_all in range(n_x):
if xi_all < xi_non_missing:
xi = xi_all
ri = 0
else:
xi = xi_non_missing
ri = ri_non_missing
try:
xi_non_missing, ri_non_missing = next(xi_iter)
except StopIteration:
xi_non_missing = n_x

xj_iter = iter(sorted_y_ratings)
try:
xj_non_missing, rj_non_missing = next(xj_iter)
except StopIteration:
xj_non_missing = n_x
for xj_all in range(n_x):
if xj_all < xj_non_missing:
xj = xj_all
rj = 0
else:
xj = xj_non_missing
rj = rj_non_missing
try:
xj_non_missing, rj_non_missing = next(xj_iter)
except StopIteration:
xj_non_missing = n_x

freq[xi, xj] += 1
prods[xi, xj] += ri * rj
sqi[xi, xj] += ri**2
Expand Down
52 changes: 48 additions & 4 deletions tests/test_similarities.py
Expand Up @@ -12,11 +12,11 @@

n_x = 8
yr_global = {
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
1: [(0, 4), (1, 4), (2, 4), ], # noqa
2: [ (2, 5), (3, 2), (4, 3) ], # noqa
3: [(1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
4: [(1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
3: [ (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
4: [ (1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
}


Expand Down Expand Up @@ -48,7 +48,51 @@ def test_cosine_sim():
# cosine sim is necessarily 1
assert sim[3, 4] == 1

# pairs of users (0, 3) have no common items
# pairs of users (0, 3) have no common items
assert sim[0, 3] == 0
assert sim[0, 4] == 0

# check for float point support and computation correctness
dot_product56 = 1 * 1.5 + 3 * 3.5 + 2 * 2.5
assert sim[5, 6] == (dot_product56 /
((1 ** 2 + 3 ** 2 + 2 ** 2) *
(1.5 ** 2 + 3.5 ** 2 + 2.5 ** 2)) ** 0.5
)

# ensure min_support is taken into account. Only users 1 and 2 have at
# least 4 common ratings.
sim = sims.cosine(n_x, yr, min_support=4)
for i in range(n_x):
for j in range(i + 1, n_x):
if i != 1 and j != 2:
assert sim[i, j] == 0


def test_cosine_full_vectors_sim():
"""Tests for the cosine similarity."""

yr = yr_global.copy()

# # shuffle every rating list, to ensure the order in which ratings are
# # processed does not matter (it's important because it used to be error
# # prone when we were using itertools.combinations)
# for _, ratings in yr.items():
# random.shuffle(ratings)

sim = sims.cosine(n_x, yr, min_support=1, common_ratings_only=False)

# check symmetry and bounds (as ratings are > 0, cosine sim must be >= 0)
for xi in range(n_x):
assert sim[xi, xi] == 1
for xj in range(n_x):
assert sim[xi, xj] == sim[xj, xi]
assert 0 <= sim[xi, xj] <= 1

# users 0, 1 and 2 have different ratings when non-common items considered
assert sim[0, 1] < 1
assert sim[0, 2] < 1

# pairs of users (0, 3) and (0,4) have no common items
assert sim[0, 3] == 0
assert sim[0, 4] == 0

Expand Down

0 comments on commit df067c0

Please sign in to comment.