Skip to content

Commit

Permalink
Fix null convergence (#7)
Browse files Browse the repository at this point in the history
* Add treatment for null convergence due to precision limits.

* add OS update to build.

* update pip version on build.
  • Loading branch information
WillianFuks committed Feb 17, 2021
1 parent 09215c7 commit 0aa8a73
Show file tree
Hide file tree
Showing 9 changed files with 95 additions and 76 deletions.
4 changes: 2 additions & 2 deletions Makefile
Expand Up @@ -2,7 +2,7 @@

flake8:
pip install -U flake8
flake8
flake8 pyClickModels

isort:
pip install -U isort
Expand Down Expand Up @@ -30,4 +30,4 @@ publish:
sh ./scripts/build_wheels.sh
#twine upload --repository testpypi dist/*
twine upload dist/*
rm -fr build dist .egg *.egg-info
#rm -fr build dist .egg *.egg-info
56 changes: 35 additions & 21 deletions pyClickModels/DBN.pyx
@@ -1,22 +1,24 @@
# cython: linetrace=True

import os
from glob import glob
import gzip
import os
import time
from glob import glob

import ujson
from libcpp.vector cimport vector
from libcpp.unordered_map cimport unordered_map
from libcpp.string cimport string
from libc.stdlib cimport rand, RAND_MAX, srand
from libc.time cimport time as ctime

from cython.operator cimport dereference, postincrement
from pyClickModels.jsonc cimport(json_object, json_tokener_parse,
json_object_object_get_ex, json_object_get_string,
lh_table, lh_entry, json_object_array_length,
json_object_array_get_idx, json_object_get_int,
json_object_put)
from libc.stdlib cimport RAND_MAX, rand, srand
from libc.time cimport time as ctime
from libcpp.string cimport string
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector

from pyClickModels.jsonc cimport (json_object, json_object_array_get_idx,
json_object_array_length,
json_object_get_int, json_object_get_string,
json_object_object_get_ex, json_object_put,
json_tokener_parse, lh_entry, lh_table)

# Start by setting the seed for the random values required for initializing the DBN
# parameters.
Expand Down Expand Up @@ -125,12 +127,14 @@ cdef class Factor:
result *= (1 - self.gamma) * (1 - self.cr)
else:
result *= self.gamma * (1 - self.cr)
# Compute P(C_{>r},P_{>r} | E_{r+1})
if not z:
if self.last_r >= self.r + 1:
return 0
else:
if self.r < self.cp_vector_given_e[0].size():
result *= self.cp_vector_given_e[0][self.r]
# P(E_r=x | C<r, P<r)
result *= (self.e_r_vector_given_CP[0][self.r] if x else
1 - self.e_r_vector_given_CP[0][self.r])
return result
Expand Down Expand Up @@ -414,6 +418,7 @@ cdef class DBNModel():
# Probability of clicks at positions greater than the last document in results
# page is zero.
X_r_vector[total_docs] = 0
gamma = self.get_param(b'gamma')

for r in range(total_docs - 1, -1, -1):
json_object_object_get_ex(
Expand All @@ -423,7 +428,6 @@ cdef class DBNModel():
)
doc = json_object_get_string(tmp)
alpha = self.get_param(b'alpha', query, &doc)
gamma = self.get_param(b'gamma')

X_r_1 = X_r_vector[r + 1]
X_r = alpha[0] + (1 - alpha[0]) * gamma[0] * X_r_1
Expand All @@ -438,6 +442,10 @@ cdef class DBNModel():
Mathematically: P(E_r = 1 | C_{<r}, P_{<r})
This is discussed in equation (24) in the blog post:
https://towardsdatascience.com/how-to-extract-relevance-from-clickstream-data-2a870df219fb
Args
----
clickstream: *json_object
Expand Down Expand Up @@ -465,8 +473,10 @@ cdef class DBNModel():
# position r + 1 will be required later so add +1 in computation
vector[float] e_r_vector_given_CP = vector[float](total_docs + 1 - idx, 0.0)

# First document has 100% of being Examined regardless of clicks or purchases.
# First document has 100% chance of being Examined regardless of clicks or
# purchases.
e_r_vector_given_CP[0] = 1
gamma = self.get_param(b'gamma')

for r in range(idx, total_docs):
json_object_object_get_ex(
Expand All @@ -492,7 +502,6 @@ cdef class DBNModel():

alpha = self.get_param(b'alpha', query, &doc)
sigma = self.get_param(b'sigma', query, &doc)
gamma = self.get_param(b'gamma')

if purchase:
return e_r_vector_given_CP
Expand Down Expand Up @@ -602,6 +611,10 @@ cdef class DBNModel():
P(C_{>r}, P_{>r} | E_{r+1})
This is equation (25) from blog post:
https://towardsdatascience.com/how-to-extract-relevance-from-clickstream-data-2a870df219fb
Args
----
clickstream: *json_object
Expand All @@ -624,13 +637,10 @@ cdef class DBNModel():

# Subtract 1 as E_{r+1} is defined up to r - 1 documents
for r in range(total_docs - 1):

e_r_vector_given_CP = self.build_e_r_vector_given_CP(clickstream, r + 1,
query)

cp_vector_given_e[r] = self.compute_cp_p(clickstream, r + 1, query,
&e_r_vector_given_CP, cr_dict)

return cp_vector_given_e

cdef int get_last_r(self, json_object *clickstream, const char *event=b'click'):
Expand Down Expand Up @@ -868,7 +878,7 @@ cdef class DBNModel():
json_object_object_get_ex(doc_data, b'purchase', &tmp)
purchase = json_object_get_int(tmp)

alpha = self.get_param(b'gamma', query, &doc)[0]
alpha = self.get_param(b'alpha', query, &doc)[0]
sigma = self.get_param(b'sigma', query, &doc)[0]
gamma = self.get_param(b'gamma')[0]

Expand All @@ -887,15 +897,19 @@ cdef class DBNModel():
e_r_vector_given_CP,
cp_vector_given_e
)

# Loop through all possible values of x, y and z, where each is an integer
# boolean.
for i in range(2):
for j in range(2):
for k in range(2):
ESS_denominator += factor.compute_factor(i, j, k)

ESS_0 = factor.compute_factor(1, 0, 0) / ESS_denominator
ESS_1 = factor.compute_factor(1, 0, 1) / ESS_denominator
if not ESS_denominator:
ESS_0, ESS_1 = 0, 0
else:
ESS_0 = factor.compute_factor(1, 0, 0) / ESS_denominator
ESS_1 = factor.compute_factor(1, 0, 1) / ESS_denominator

tmp_gamma_param[0][0] += ESS_1
tmp_gamma_param[0][1] += ESS_0 + ESS_1
Expand Down
4 changes: 1 addition & 3 deletions pyClickModels/__version__.py
@@ -1,3 +1 @@
VERSION = (0, 0, 1)

__version__ = '.'.join([str(e) for e in VERSION])
__version__ = '0.0.2'
2 changes: 2 additions & 0 deletions scripts/build_wheels.sh
@@ -1,10 +1,12 @@
docker run -v $(pwd):/pyClickModels quay.io/pypa/manylinux1_x86_64 sh -c '''
yum update
yum install -y json-c-devel
cd /pyClickModels
for PYVER in /opt/python/*/bin/; do
if [[ $PYVER != *"27"* ]]; then
"${PYVER}/pip" install -U pip
"${PYVER}/pip" install -U setuptools
"${PYVER}/pip" install -r requirements.txt
"${PYVER}/python" setup.py sdist bdist_wheel
Expand Down
8 changes: 4 additions & 4 deletions setup.py
Expand Up @@ -2,13 +2,13 @@

import os
import sys
import Cython.Compiler.Options
from codecs import open
from Cython.Distutils import build_ext
from setuptools import setup
from Cython.Build import cythonize
from distutils.extension import Extension
from setuptools.command.test import test as TestCommand
from distutils.extension import Extension
import Cython.Compiler.Options
from Cython.Distutils import build_ext
from Cython.Build import cythonize


here = os.path.abspath(os.path.dirname(__file__))
Expand Down
21 changes: 0 additions & 21 deletions tests/conftest.py
Expand Up @@ -82,24 +82,3 @@ def build_DBN_test_data(users=10, docs=10, queries=2):
for row in final_result[half_results:]:
f.write(json.dumps(row).encode() + '\n'.encode())
return persistence, params, tmp_folder


@pytest.fixture
def sessions():
sessions = [
{
'sessionID': [
{"doc": "doc0", "click": 0, "purchase": 0},
{"doc": "doc1", "click": 1, "purchase": 0},
{"doc": "doc2", "click": 1, "purchase": 1}
]
},
{
'sessionID': [
{"doc": "doc0", "click": 0, "purchase": 0},
{"doc": "doc1", "click": 1, "purchase": 0}
]
},

]
return sessions
Binary file added tests/fixtures/eighty_skus/judgments.gz
Binary file not shown.
Binary file added tests/fixtures/null_test/judgments_test_null.gz
Binary file not shown.
76 changes: 51 additions & 25 deletions tests/test_cy_DBN.pyx
@@ -1,16 +1,23 @@
import gzip
import tempfile

import ujson
import gzip

from cython.operator cimport dereference, postincrement
from libcpp.string cimport string
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from cython.operator cimport dereference, postincrement

from pyClickModels.DBN cimport DBNModel, Factor

from pyClickModels.DBN import DBN
from pyClickModels.jsonc cimport(json_object, json_tokener_parse,
json_object_get_object, lh_table, json_object_put)

from pyClickModels.jsonc cimport (json_object, json_object_get_object,
json_object_put, json_tokener_parse,
lh_table)

from conftest import build_DBN_test_data
from numpy.testing import assert_almost_equal, assert_allclose
from numpy.testing import assert_allclose, assert_almost_equal

ctypedef unordered_map[string, unordered_map[string, float]] dbn_param

Expand Down Expand Up @@ -49,7 +56,6 @@ cdef bint test_fit():

# it = model.alpha_params.begin()
while(it != model.alpha_params.end()):
# prints keys
# print(dereference(it).first)
query = (dereference(it).first)
dquery = extract_keys(query)
Expand Down Expand Up @@ -1886,6 +1892,22 @@ cdef bint test_export_judgments():
return True


cdef bint test_not_null_converence():
    """Smoke test: fit a DBN on the ``null_test`` fixture folder for one iteration.

    No assertion is made on the fitted parameters; the test succeeds as long as
    ``fit`` runs to completion, in which case ``True`` is returned.
    """
    cdef DBNModel dbn_model = DBN()

    dbn_model.fit('tests/fixtures/null_test', iters=1)
    return True


cdef bint test_long_list_null_converence():
    """Smoke test: fit a DBN on the ``eighty_skus`` fixture folder for two iterations.

    No assertion is made on the fitted parameters; the test succeeds as long as
    ``fit`` runs to completion, in which case ``True`` is returned.
    """
    cdef DBNModel dbn_model = DBN()

    dbn_model.fit('tests/fixtures/eighty_skus', iters=2)
    return True


cpdef run_tests():
assert test_get_search_context_string()
assert test_compute_cr()
Expand All @@ -1906,25 +1928,29 @@ cpdef run_tests():
assert test_update_gamma_param()
assert test_fit()
assert test_export_judgments()
assert test_not_null_converence()
assert test_long_list_null_converence()


if __name__ == '__main__':
assert test_get_search_context_string()
assert test_compute_cr()
assert test_get_param()
assert test_build_e_r_vector(&alpha_params, &sigma_params, &gamma_param)
assert test_build_X_r_vector(&alpha_params, &sigma_params, &gamma_param)
assert test_build_e_r_vector_given_CP(&alpha_params, &sigma_params, &gamma_param)
assert test_build_cp_p(&alpha_params)
assert test_build_CP_vector_given_e(&alpha_params, &sigma_params, &gamma_param)
assert test_get_last_r()
assert test_update_tmp_alpha(&alpha_params, &sigma_params, &gamma_param)
assert test_update_tmp_sigma(&alpha_params, &sigma_params, &gamma_param)
assert test_compute_factor_last_click_lower_than_r()
assert test_compute_factor_last_click_higher_than_r()
assert test_update_tmp_gamma()
assert test_update_alpha_params()
assert test_update_sigma_params()
assert test_update_gamma_param()
assert test_fit()
assert test_export_judgments()
#assert test_get_search_context_string()
#assert test_compute_cr()
#assert test_get_param()
#assert test_build_e_r_vector(&alpha_params, &sigma_params, &gamma_param)
#assert test_build_X_r_vector(&alpha_params, &sigma_params, &gamma_param)
#assert test_build_e_r_vector_given_CP(&alpha_params, &sigma_params, &gamma_param)
#assert test_build_cp_p(&alpha_params)
#assert test_build_CP_vector_given_e(&alpha_params, &sigma_params, &gamma_param)
#assert test_get_last_r()
#assert test_update_tmp_alpha(&alpha_params, &sigma_params, &gamma_param)
#assert test_update_tmp_sigma(&alpha_params, &sigma_params, &gamma_param)
#assert test_compute_factor_last_click_lower_than_r()
#assert test_compute_factor_last_click_higher_than_r()
#assert test_update_tmp_gamma()
#assert test_update_alpha_params()
#assert test_update_sigma_params()
#assert test_update_gamma_param()
#assert test_fit()
#assert test_export_judgments()
#assert test_not_null_converence()
pass

0 comments on commit 0aa8a73

Please sign in to comment.