diff --git a/Makefile b/Makefile index 8d3186f..f9b5f7b 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ flake8: pip install -U flake8 - flake8 + flake8 pyClickModels isort: pip install -U isort @@ -30,4 +30,4 @@ publish: sh ./scripts/build_wheels.sh #twine upload --repository testpypi dist/* twine upload dist/* - rm -fr build dist .egg *.egg-info + #rm -fr build dist .egg *.egg-info diff --git a/pyClickModels/DBN.pyx b/pyClickModels/DBN.pyx index bf97418..ec2941f 100644 --- a/pyClickModels/DBN.pyx +++ b/pyClickModels/DBN.pyx @@ -1,22 +1,24 @@ # cython: linetrace=True -import os -from glob import glob import gzip +import os import time +from glob import glob + import ujson -from libcpp.vector cimport vector -from libcpp.unordered_map cimport unordered_map -from libcpp.string cimport string -from libc.stdlib cimport rand, RAND_MAX, srand -from libc.time cimport time as ctime + from cython.operator cimport dereference, postincrement -from pyClickModels.jsonc cimport(json_object, json_tokener_parse, - json_object_object_get_ex, json_object_get_string, - lh_table, lh_entry, json_object_array_length, - json_object_array_get_idx, json_object_get_int, - json_object_put) +from libc.stdlib cimport RAND_MAX, rand, srand +from libc.time cimport time as ctime +from libcpp.string cimport string +from libcpp.unordered_map cimport unordered_map +from libcpp.vector cimport vector +from pyClickModels.jsonc cimport (json_object, json_object_array_get_idx, + json_object_array_length, + json_object_get_int, json_object_get_string, + json_object_object_get_ex, json_object_put, + json_tokener_parse, lh_entry, lh_table) # Start by setting the seed for the random values required for initalizing the DBN # parameters. @@ -125,12 +127,14 @@ cdef class Factor: result *= (1 - self.gamma) * (1 - self.cr) else: result *= self.gamma * (1 - self.cr) + # Compute P(C_{>r},P_{>r} | E_{r+1}) if not z: if self.last_r >= self.r + 1: return 0 else: if self.r < self.cp_vector_given_e[0].size(): result *= self.cp_vector_given_e[0][self.r] + # P(E_r=x | Cr}, P_{>r} | E_{r+1}) + This is equation (25) from blog post: + + https://towardsdatascience.com/how-to-extract-relevance-from-clickstream-data-2a870df219fb + Args ---- clickstream: *json_object @@ -624,13 +637,10 @@ cdef class DBNModel(): # Subtract 1 as E_{r+1} is defined up to r - 1 documents for r in range(total_docs - 1): - e_r_vector_given_CP = self.build_e_r_vector_given_CP(clickstream, r + 1, query) - cp_vector_given_e[r] = self.compute_cp_p(clickstream, r + 1, query, &e_r_vector_given_CP, cr_dict) - return cp_vector_given_e cdef int get_last_r(self, json_object *clickstream, const char *event=b'click'): @@ -868,7 +878,7 @@ cdef class DBNModel(): json_object_object_get_ex(doc_data, b'purchase', &tmp) purchase = json_object_get_int(tmp) - alpha = self.get_param(b'gamma', query, &doc)[0] + alpha = self.get_param(b'alpha', query, &doc)[0] sigma = self.get_param(b'sigma', query, &doc)[0] gamma = self.get_param(b'gamma')[0] @@ -887,6 +897,7 @@ cdef class DBNModel(): e_r_vector_given_CP, cp_vector_given_e ) + # Loop through all possible values of x, y and z, where each is an integer # boolean. for i in range(2): @@ -894,8 +905,11 @@ cdef class DBNModel(): for k in range(2): ESS_denominator += factor.compute_factor(i, j, k) - ESS_0 = factor.compute_factor(1, 0, 0) / ESS_denominator - ESS_1 = factor.compute_factor(1, 0, 1) / ESS_denominator + if not ESS_denominator: + ESS_0, ESS_1 = 0, 0 + else: + ESS_0 = factor.compute_factor(1, 0, 0) / ESS_denominator + ESS_1 = factor.compute_factor(1, 0, 1) / ESS_denominator tmp_gamma_param[0][0] += ESS_1 tmp_gamma_param[0][1] += ESS_0 + ESS_1 diff --git a/pyClickModels/__version__.py b/pyClickModels/__version__.py index 6bb6267..d18f409 100644 --- a/pyClickModels/__version__.py +++ b/pyClickModels/__version__.py @@ -1,3 +1 @@ -VERSION = (0, 0, 1) - -__version__ = '.'.join([str(e) for e in VERSION]) +__version__ = '0.0.2' diff --git a/scripts/build_wheels.sh b/scripts/build_wheels.sh index f984090..f7f6a8b 100755 --- a/scripts/build_wheels.sh +++ b/scripts/build_wheels.sh @@ -1,10 +1,12 @@ docker run -v $(pwd):/pyClickModels quay.io/pypa/manylinux1_x86_64 sh -c ''' +yum update yum install -y json-c-devel cd /pyClickModels for PYVER in /opt/python/*/bin/; do if [[ $PYVER != *"27"* ]]; then + "${PYVER}/pip" install -U pip "${PYVER}/pip" install -U setuptools "${PYVER}/pip" install -r requirements.txt "${PYVER}/python" setup.py sdist bdist_wheel diff --git a/setup.py b/setup.py index a92e9a7..e9e1bff 100644 --- a/setup.py +++ b/setup.py @@ -2,13 +2,13 @@ import os import sys -import Cython.Compiler.Options from codecs import open -from Cython.Distutils import build_ext from setuptools import setup -from Cython.Build import cythonize -from distutils.extension import Extension from setuptools.command.test import test as TestCommand +from distutils.extension import Extension +import Cython.Compiler.Options +from Cython.Distutils import build_ext +from Cython.Build import cythonize here = os.path.abspath(os.path.dirname(__file__)) diff --git a/tests/conftest.py b/tests/conftest.py index 6c32d07..b3752c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -82,24 +82,3 @@ def build_DBN_test_data(users=10, docs=10, queries=2): for row in final_result[half_results:]: f.write(json.dumps(row).encode() + '\n'.encode()) return persistence, params, tmp_folder - - -@pytest.fixture -def sessions(): - sessions = [ - { - 'sessionID': [ - {"doc": "doc0", "click": 0, "purchase": 0}, - {"doc": "doc1", "click": 1, "purchase": 0}, - {"doc": "doc2", "click": 1, "purchase": 1} - ] - }, - { - 'sessionID': [ - {"doc": "doc0", "click": 0, "purchase": 0}, - {"doc": "doc1", "click": 1, "purchase": 0} - ] - }, - - ] - return sessions diff --git a/tests/fixtures/eighty_skus/judgments.gz b/tests/fixtures/eighty_skus/judgments.gz new file mode 100644 index 0000000..d6627ae Binary files /dev/null and b/tests/fixtures/eighty_skus/judgments.gz differ diff --git a/tests/fixtures/null_test/judgments_test_null.gz b/tests/fixtures/null_test/judgments_test_null.gz new file mode 100644 index 0000000..563466f Binary files /dev/null and b/tests/fixtures/null_test/judgments_test_null.gz differ diff --git a/tests/test_cy_DBN.pyx b/tests/test_cy_DBN.pyx index 3353f85..bf5a36b 100644 --- a/tests/test_cy_DBN.pyx +++ b/tests/test_cy_DBN.pyx @@ -1,16 +1,23 @@ +import gzip import tempfile + import ujson -import gzip + +from cython.operator cimport dereference, postincrement from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector -from cython.operator cimport dereference, postincrement + from pyClickModels.DBN cimport DBNModel, Factor + from pyClickModels.DBN import DBN -from pyClickModels.jsonc cimport(json_object, json_tokener_parse, - json_object_get_object, lh_table, json_object_put) + +from pyClickModels.jsonc cimport (json_object, json_object_get_object, + json_object_put, json_tokener_parse, + lh_table) + from conftest import build_DBN_test_data -from numpy.testing import assert_almost_equal, assert_allclose +from numpy.testing import assert_allclose, assert_almost_equal ctypedef unordered_map[string, unordered_map[string, float]] dbn_param @@ -49,7 +56,6 @@ cdef bint test_fit(): # it = model.alpha_params.begin() while(it != model.alpha_params.end()): - # prints keys # print(dereference(it).first) query = (dereference(it).first) dquery = extract_keys(query) @@ -1886,6 +1892,22 @@ cdef bint test_export_judgments(): return True +cdef bint test_not_null_converence(): + cdef: + DBNModel model = DBN() + + model.fit('tests/fixtures/null_test', iters=1) + return True + + +cdef bint test_long_list_null_converence(): + cdef: + DBNModel model = DBN() + + model.fit('tests/fixtures/eighty_skus', iters=2) + return True + + cpdef run_tests(): assert test_get_search_context_string() assert test_compute_cr() @@ -1906,25 +1928,29 @@ cpdef run_tests(): assert test_update_gamma_param() assert test_fit() assert test_export_judgments() + assert test_not_null_converence() + assert test_long_list_null_converence() if __name__ == '__main__': - assert test_get_search_context_string() - assert test_compute_cr() - assert test_get_param() - assert test_build_e_r_vector(&alpha_params, &sigma_params, &gamma_param) - assert test_build_X_r_vector(&alpha_params, &sigma_params, &gamma_param) - assert test_build_e_r_vector_given_CP(&alpha_params, &sigma_params, &gamma_param) - assert test_build_cp_p(&alpha_params) - assert test_build_CP_vector_given_e(&alpha_params, &sigma_params, &gamma_param) - assert test_get_last_r() - assert test_update_tmp_alpha(&alpha_params, &sigma_params, &gamma_param) - assert test_update_tmp_sigma(&alpha_params, &sigma_params, &gamma_param) - assert test_compute_factor_last_click_lower_than_r() - assert test_compute_factor_last_click_higher_than_r() - assert test_update_tmp_gamma() - assert test_update_alpha_params() - assert test_update_sigma_params() - assert test_update_gamma_param() - assert test_fit() - assert test_export_judgments() + #assert test_get_search_context_string() + #assert test_compute_cr() + #assert test_get_param() + #assert test_build_e_r_vector(&alpha_params, &sigma_params, &gamma_param) + #assert test_build_X_r_vector(&alpha_params, &sigma_params, &gamma_param) + #assert test_build_e_r_vector_given_CP(&alpha_params, &sigma_params, &gamma_param) + #assert test_build_cp_p(&alpha_params) + #assert test_build_CP_vector_given_e(&alpha_params, &sigma_params, &gamma_param) + #assert test_get_last_r() + #assert test_update_tmp_alpha(&alpha_params, &sigma_params, &gamma_param) + #assert test_update_tmp_sigma(&alpha_params, &sigma_params, &gamma_param) + #assert test_compute_factor_last_click_lower_than_r() + #assert test_compute_factor_last_click_higher_than_r() + #assert test_update_tmp_gamma() + #assert test_update_alpha_params() + #assert test_update_sigma_params() + #assert test_update_gamma_param() + #assert test_fit() + #assert test_export_judgments() + #assert test_not_null_converence() + pass