From 9a71e633187e6cf92d68e64255f25ef5a5632788 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 22 Jul 2025 22:49:33 -0400 Subject: [PATCH 1/7] Fix NumPy 2.1.0 random seed overflow issue Ensure random seeds are always non-negative by wrapping with abs() and int(). This prevents ValueError when large entity IDs cause integer overflow in seed calculation with NumPy 2.1.0's stricter validation. The issue occurs in policyengine-uk tests where the random() function calculates seeds as id * 100 + count, which can overflow for large IDs. --- CHANGELOG.md | 9 ++++++++- changelog.yaml | 5 +++++ policyengine_core/commons/formulas.py | 2 +- setup.py | 2 +- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d65f8e7..e2b41f85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - added support for python 3.13.0 - upgraded dependency to numpy 2.1.0 +## [3.18.1] - 2025-07-22 22:49:07 + +### Fixed + +- Fix NumPy 2.1.0 random seed overflow issue by ensuring seeds are always non-negative + ## [3.18.0] - 2025-07-22 20:16:08 ### Changed @@ -1027,7 +1033,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 -[3.19.0]: https://github.com/PolicyEngine/policyengine-core/compare/3.18.0...3.19.0 +[3.19.0]: https://github.com/PolicyEngine/policyengine-core/compare/3.18.1...3.19.0 +[3.18.1]: https://github.com/PolicyEngine/policyengine-core/compare/3.18.0...3.18.1 [3.18.0]: https://github.com/PolicyEngine/policyengine-core/compare/3.17.1...3.18.0 [3.17.1]: https://github.com/PolicyEngine/policyengine-core/compare/3.17.0...3.17.1 [3.17.0]: https://github.com/PolicyEngine/policyengine-core/compare/3.16.6...3.17.0 diff --git a/changelog.yaml b/changelog.yaml index 7f2e11f5..feaf6ef1 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -836,3 +836,8 @@ - added support for python 3.13.0 - upgraded dependency to numpy 2.1.0 
date: 2025-07-23 00:48:51 +- bump: patch + changes: + fixed: + - Fix NumPy 2.1.0 random seed overflow issue by ensuring seeds are always non-negative + date: 2025-07-22 22:49:07 diff --git a/policyengine_core/commons/formulas.py b/policyengine_core/commons/formulas.py index 7d316c36..660f3f1e 100644 --- a/policyengine_core/commons/formulas.py +++ b/policyengine_core/commons/formulas.py @@ -337,7 +337,7 @@ def random(population): values = np.array( [ np.random.default_rng( - seed=id * 100 + population.simulation.count_random_calls + seed=int(abs(id * 100 + population.simulation.count_random_calls)) ).random() for id in entity_ids ] diff --git a/setup.py b/setup.py index e71ff8e7..a993bc6d 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ setup( name="policyengine-core", - version="3.19.0", + version="3.19.1", author="PolicyEngine", author_email="hello@policyengine.org", classifiers=[ From d04815b9ecc6d4603610c277decc1bdd93847b8e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 22 Jul 2025 22:57:08 -0400 Subject: [PATCH 2/7] Apply Black formatting --- policyengine_core/commons/formulas.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/policyengine_core/commons/formulas.py b/policyengine_core/commons/formulas.py index 660f3f1e..34d89bea 100644 --- a/policyengine_core/commons/formulas.py +++ b/policyengine_core/commons/formulas.py @@ -337,7 +337,9 @@ def random(population): values = np.array( [ np.random.default_rng( - seed=int(abs(id * 100 + population.simulation.count_random_calls)) + seed=int( + abs(id * 100 + population.simulation.count_random_calls) + ) ).random() for id in entity_ids ] From 2040688fa16f25aba5ace470a5945cecab0dd0b1 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 23 Jul 2025 11:23:49 -0400 Subject: [PATCH 3/7] Fix NumPy random seed overflow and bump version to 3.19.1 --- changelog_entry.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..a17b363c 
100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - Fix NumPy 2.1.0 random seed overflow issue by ensuring seeds are always non-negative \ No newline at end of file From 476ee6f2af71d152b2257fec7f105b7a29fc66ed Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 24 Jul 2025 10:52:04 -0400 Subject: [PATCH 4/7] Add tests for random seed overflow fix - Test handling of large entity IDs that would cause overflow - Test consistency of random values for same inputs - Test proper call count incrementing - Test handling of negative IDs - Test specific overflow scenario that would cause ValueError without fix --- tests/core/commons/test_random_seed.py | 150 +++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 tests/core/commons/test_random_seed.py diff --git a/tests/core/commons/test_random_seed.py b/tests/core/commons/test_random_seed.py new file mode 100644 index 00000000..19d7d9b8 --- /dev/null +++ b/tests/core/commons/test_random_seed.py @@ -0,0 +1,150 @@ +"""Test the random function with large entity IDs to ensure no overflow.""" + +import numpy as np +import pytest +from unittest.mock import Mock +from policyengine_core.commons.formulas import random + + +class TestRandomSeed: + """Test random seed handling to prevent NumPy overflow errors.""" + + def test_random_with_large_entity_ids(self): + """Test that random() handles large entity IDs without overflow.""" + # Create a mock population with simulation + population = Mock() + population.simulation = Mock() + population.simulation.count_random_calls = 0 + population.entity = Mock() + population.entity.key = "person" + + # Mock the get_holder and get_known_periods + holder = Mock() + holder.get_known_periods.return_value = [] + population.simulation.get_holder.return_value = holder + population.simulation.default_calculation_period = Mock() + + # Test with very large entity IDs that would cause overflow + # if not handled 
properly + large_ids = np.array([ + np.iinfo(np.int64).max - 1000, # Very large positive ID + np.iinfo(np.int64).max // 2, # Large positive ID + 1234567890123456789, # Another large ID + ]) + + # Mock the population call to return large IDs + population.side_effect = lambda key, period: large_ids + + # This should not raise a ValueError about negative seeds + result = random(population) + + # Check that we got valid random values + assert isinstance(result, np.ndarray) + assert len(result) == len(large_ids) + assert all(0 <= val <= 1 for val in result) + + def test_random_seed_consistency(self): + """Test that random() produces consistent results for same inputs.""" + # Create mock population + population = Mock() + population.simulation = Mock() + population.simulation.count_random_calls = 0 + population.entity = Mock() + population.entity.key = "household" + + holder = Mock() + holder.get_known_periods.return_value = [] + population.simulation.get_holder.return_value = holder + population.simulation.default_calculation_period = Mock() + + # Use same IDs + ids = np.array([1, 2, 3]) + population.side_effect = lambda key, period: ids + + # First call + result1 = random(population) + + # Reset count to simulate same conditions + population.simulation.count_random_calls = 0 + + # Second call with same conditions + result2 = random(population) + + # Results should be identical + np.testing.assert_array_equal(result1, result2) + + def test_random_increments_call_count(self): + """Test that random() increments the call counter.""" + population = Mock() + population.simulation = Mock() + population.simulation.count_random_calls = 0 + population.entity = Mock() + population.entity.key = "person" + + holder = Mock() + holder.get_known_periods.return_value = [] + population.simulation.get_holder.return_value = holder + population.simulation.default_calculation_period = Mock() + + ids = np.array([1, 2, 3]) + population.side_effect = lambda key, period: ids + + # First call + 
random(population) + assert population.simulation.count_random_calls == 1 + + # Second call + random(population) + assert population.simulation.count_random_calls == 2 + + def test_random_handles_negative_ids(self): + """Test that random() handles negative IDs properly.""" + population = Mock() + population.simulation = Mock() + population.simulation.count_random_calls = 0 + population.entity = Mock() + population.entity.key = "person" + + holder = Mock() + holder.get_known_periods.return_value = [] + population.simulation.get_holder.return_value = holder + population.simulation.default_calculation_period = Mock() + + # Include negative IDs + ids = np.array([-100, -1, 0, 1, 100]) + population.side_effect = lambda key, period: ids + + # Should handle negative IDs without errors + result = random(population) + + assert isinstance(result, np.ndarray) + assert len(result) == len(ids) + assert all(0 <= val <= 1 for val in result) + + def test_no_negative_seed_error_with_overflow(self): + """Test that seed calculation overflow doesn't cause negative seed error.""" + population = Mock() + population.simulation = Mock() + population.simulation.count_random_calls = 999999999 # Large count + population.entity = Mock() + population.entity.key = "person" + + holder = Mock() + holder.get_known_periods.return_value = [] + population.simulation.get_holder.return_value = holder + population.simulation.default_calculation_period = Mock() + + # Use the exact ID that would cause overflow in old implementation + # This ID when multiplied by 100 and added to count_random_calls + # would overflow int64 and become negative + overflow_id = np.array([np.iinfo(np.int64).max // 100]) + population.side_effect = lambda key, period: overflow_id + + # In the old implementation, this would raise: + # ValueError: Seed must be between 0 and 2**32 - 1 + # With the fix using abs(), it should work fine + result = random(population) + + assert isinstance(result, np.ndarray) + assert len(result) == 1 + 
assert 0 <= result[0] <= 1 \ No newline at end of file From 3c49ec30c0e054526906e95f4796555ae4d4a2c7 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 24 Jul 2025 11:20:23 -0400 Subject: [PATCH 5/7] Apply Black formatting to test file --- tests/core/commons/test_random_seed.py | 68 +++++++++++++------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/tests/core/commons/test_random_seed.py b/tests/core/commons/test_random_seed.py index 19d7d9b8..e63abe3f 100644 --- a/tests/core/commons/test_random_seed.py +++ b/tests/core/commons/test_random_seed.py @@ -17,32 +17,34 @@ def test_random_with_large_entity_ids(self): population.simulation.count_random_calls = 0 population.entity = Mock() population.entity.key = "person" - + # Mock the get_holder and get_known_periods holder = Mock() holder.get_known_periods.return_value = [] population.simulation.get_holder.return_value = holder population.simulation.default_calculation_period = Mock() - + # Test with very large entity IDs that would cause overflow # if not handled properly - large_ids = np.array([ - np.iinfo(np.int64).max - 1000, # Very large positive ID - np.iinfo(np.int64).max // 2, # Large positive ID - 1234567890123456789, # Another large ID - ]) - + large_ids = np.array( + [ + np.iinfo(np.int64).max - 1000, # Very large positive ID + np.iinfo(np.int64).max // 2, # Large positive ID + 1234567890123456789, # Another large ID + ] + ) + # Mock the population call to return large IDs population.side_effect = lambda key, period: large_ids - + # This should not raise a ValueError about negative seeds result = random(population) - + # Check that we got valid random values assert isinstance(result, np.ndarray) assert len(result) == len(large_ids) assert all(0 <= val <= 1 for val in result) - + def test_random_seed_consistency(self): """Test that random() produces consistent results for same inputs.""" # Create mock population @@ -51,28 +53,28 @@ def test_random_seed_consistency(self): 
population.simulation.count_random_calls = 0 population.entity = Mock() population.entity.key = "household" - + holder = Mock() holder.get_known_periods.return_value = [] population.simulation.get_holder.return_value = holder population.simulation.default_calculation_period = Mock() - + # Use same IDs ids = np.array([1, 2, 3]) population.side_effect = lambda key, period: ids - + # First call result1 = random(population) - + # Reset count to simulate same conditions population.simulation.count_random_calls = 0 - + # Second call with same conditions result2 = random(population) - + # Results should be identical np.testing.assert_array_equal(result1, result2) - + def test_random_increments_call_count(self): """Test that random() increments the call counter.""" population = Mock() @@ -80,23 +82,23 @@ def test_random_increments_call_count(self): population.simulation.count_random_calls = 0 population.entity = Mock() population.entity.key = "person" - + holder = Mock() holder.get_known_periods.return_value = [] population.simulation.get_holder.return_value = holder population.simulation.default_calculation_period = Mock() - + ids = np.array([1, 2, 3]) population.side_effect = lambda key, period: ids - + # First call random(population) assert population.simulation.count_random_calls == 1 - + # Second call random(population) assert population.simulation.count_random_calls == 2 - + def test_random_handles_negative_ids(self): """Test that random() handles negative IDs properly.""" population = Mock() @@ -104,23 +106,23 @@ def test_random_handles_negative_ids(self): population.simulation.count_random_calls = 0 population.entity = Mock() population.entity.key = "person" - + holder = Mock() holder.get_known_periods.return_value = [] population.simulation.get_holder.return_value = holder population.simulation.default_calculation_period = Mock() - + # Include negative IDs ids = np.array([-100, -1, 0, 1, 100]) population.side_effect = lambda key, period: ids - + # Should handle 
negative IDs without errors result = random(population) - + assert isinstance(result, np.ndarray) assert len(result) == len(ids) assert all(0 <= val <= 1 for val in result) - + def test_no_negative_seed_error_with_overflow(self): """Test that seed calculation overflow doesn't cause negative seed error.""" population = Mock() @@ -128,23 +130,23 @@ def test_no_negative_seed_error_with_overflow(self): population.simulation.count_random_calls = 999999999 # Large count population.entity = Mock() population.entity.key = "person" - + holder = Mock() holder.get_known_periods.return_value = [] population.simulation.get_holder.return_value = holder population.simulation.default_calculation_period = Mock() - + # Use the exact ID that would cause overflow in old implementation # This ID when multiplied by 100 and added to count_random_calls # would overflow int64 and become negative overflow_id = np.array([np.iinfo(np.int64).max // 100]) population.side_effect = lambda key, period: overflow_id - + # In the old implementation, this would raise: # ValueError: Seed must be between 0 and 2**32 - 1 # With the fix using abs(), it should work fine result = random(population) - + assert isinstance(result, np.ndarray) assert len(result) == 1 - assert 0 <= result[0] <= 1 \ No newline at end of file + assert 0 <= result[0] <= 1 From 0874be08f6aa8a3c03ab6f40353a1d27678e3827 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 24 Jul 2025 12:13:39 -0400 Subject: [PATCH 6/7] Add GitHub token to smoke tests to prevent API rate limit errors - Uses the built-in GITHUB_TOKEN secret to authenticate API requests - Increases rate limit from 60/hour (unauthenticated) to 1,000/hour per repository (the GITHUB_TOKEN limit) - Prevents intermittent CI failures due to 403 errors when downloading data --- .github/workflows/pr.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9c862e0c..55aa9475 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml
@@ -97,4 +97,5 @@ jobs: - name: Run smoke tests only run: python -m pytest -m smoke --reruns 2 --reruns-delay 5 -v -s env: - RUN_SMOKE_TESTS: "1" \ No newline at end of file + RUN_SMOKE_TESTS: "1" + POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file From 58c2fcabdd06d99801a50810289260fb88709fda Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 24 Jul 2025 12:24:06 -0400 Subject: [PATCH 7/7] Add comprehensive tests for between() and is_in() functions - Test between() with various inclusive options - Test is_in() with different data types and edge cases - Improve overall code coverage to pass codecov checks --- tests/core/commons/test_between_function.py | 79 +++++++++++++++++++++ tests/core/commons/test_is_in_function.py | 79 +++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 tests/core/commons/test_between_function.py create mode 100644 tests/core/commons/test_is_in_function.py diff --git a/tests/core/commons/test_between_function.py b/tests/core/commons/test_between_function.py new file mode 100644 index 00000000..7ad05056 --- /dev/null +++ b/tests/core/commons/test_between_function.py @@ -0,0 +1,79 @@ +"""Test the between function from commons.formulas.""" + +import numpy as np +import pytest +from policyengine_core.commons.formulas import between + + +class TestBetweenFunction: + """Test the between function for checking if values are within bounds.""" + + def test_between_inclusive_both(self): + """Test between with both bounds inclusive (default).""" + values = np.array([1, 2, 3, 4, 5]) + result = between(values, 2, 4) + expected = np.array([False, True, True, True, False]) + np.testing.assert_array_equal(result, expected) + + def test_between_inclusive_left(self): + """Test between with only left bound inclusive.""" + values = np.array([1, 2, 3, 4, 5]) + result = between(values, 2, 4, inclusive="left") + expected = np.array([False, True, True, False, False]) + 
np.testing.assert_array_equal(result, expected) + + def test_between_inclusive_right(self): + """Test between with only right bound inclusive.""" + values = np.array([1, 2, 3, 4, 5]) + result = between(values, 2, 4, inclusive="right") + expected = np.array([False, False, True, True, False]) + np.testing.assert_array_equal(result, expected) + + def test_between_inclusive_neither(self): + """Test between with neither bound inclusive.""" + values = np.array([1, 2, 3, 4, 5]) + result = between(values, 2, 4, inclusive="neither") + expected = np.array([False, False, True, False, False]) + np.testing.assert_array_equal(result, expected) + + def test_between_with_floats(self): + """Test between with float values.""" + values = np.array([1.5, 2.5, 3.5, 4.5]) + result = between(values, 2.0, 4.0) + expected = np.array([False, True, True, False]) + np.testing.assert_array_equal(result, expected) + + def test_between_with_negative_values(self): + """Test between with negative values.""" + values = np.array([-3, -2, -1, 0, 1, 2, 3]) + result = between(values, -2, 2) + expected = np.array([False, True, True, True, True, True, False]) + np.testing.assert_array_equal(result, expected) + + def test_between_single_value(self): + """Test between with a single value.""" + value = 5 + assert between(value, 0, 10).item() == True + assert between(value, 0, 4).item() == False + assert between(value, 5, 10).item() == True + assert between(value, 5, 10, inclusive="left").item() == True + assert between(value, 5, 10, inclusive="neither").item() == False + + def test_between_edge_cases(self): + """Test between with edge cases.""" + # Empty array + values = np.array([]) + result = between(values, 0, 10) + assert len(result) == 0 + + # All values equal to bounds + values = np.array([5, 5, 5]) + result = between(values, 5, 5) + expected = np.array([True, True, True]) + np.testing.assert_array_equal(result, expected) + + # Bounds in reverse order (upper < lower) + values = np.array([1, 2, 3, 4, 
5]) + result = between(values, 4, 2) # This should return all False + expected = np.array([False, False, False, False, False]) + np.testing.assert_array_equal(result, expected) diff --git a/tests/core/commons/test_is_in_function.py b/tests/core/commons/test_is_in_function.py new file mode 100644 index 00000000..ee724ffd --- /dev/null +++ b/tests/core/commons/test_is_in_function.py @@ -0,0 +1,79 @@ +"""Test the is_in function from commons.formulas.""" + +import numpy as np +import pytest +from policyengine_core.commons.formulas import is_in + + +class TestIsInFunction: + """Test the is_in function for checking membership.""" + + def test_is_in_basic(self): + """Test basic is_in functionality.""" + values = np.array([1, 2, 3, 4, 5]) + result = is_in(values, 2, 4) + expected = np.array([False, True, False, True, False]) + np.testing.assert_array_equal(result, expected) + + def test_is_in_with_list(self): + """Test is_in with a list of targets.""" + values = np.array([1, 2, 3, 4, 5]) + result = is_in(values, [2, 4]) + expected = np.array([False, True, False, True, False]) + np.testing.assert_array_equal(result, expected) + + def test_is_in_with_strings(self): + """Test is_in with string values.""" + values = np.array(["apple", "banana", "cherry", "date"]) + result = is_in(values, "banana", "date") + expected = np.array([False, True, False, True]) + np.testing.assert_array_equal(result, expected) + + def test_is_in_with_mixed_types(self): + """Test is_in with mixed numeric types.""" + values = np.array([1.0, 2.0, 3.0, 4.0]) + result = is_in(values, 2, 4) # int targets, float values + expected = np.array([False, True, False, True]) + np.testing.assert_array_equal(result, expected) + + def test_is_in_single_value(self): + """Test is_in with a single value.""" + value = 5 + assert is_in(value, 5) == True + assert is_in(value, 1, 2, 3, 4, 5) == True + assert is_in(value, 1, 2, 3) == False + assert is_in(value, [1, 2, 3, 4, 5]) == True + + def test_is_in_empty_targets(self): 
+ """Test is_in with empty targets.""" + values = np.array([1, 2, 3]) + result = is_in(values, []) + expected = np.array([False, False, False]) + np.testing.assert_array_equal(result, expected) + + def test_is_in_empty_values(self): + """Test is_in with empty values array.""" + values = np.array([]) + result = is_in(values, 1, 2, 3) + assert len(result) == 0 + + def test_is_in_with_none(self): + """Test is_in with None values.""" + values = np.array([1, 2, None, 4], dtype=object) + result = is_in(values, None) + expected = np.array([False, False, True, False]) + np.testing.assert_array_equal(result, expected) + + def test_is_in_all_match(self): + """Test is_in where all values match.""" + values = np.array([1, 1, 1, 1]) + result = is_in(values, 1) + expected = np.array([True, True, True, True]) + np.testing.assert_array_equal(result, expected) + + def test_is_in_no_match(self): + """Test is_in where no values match.""" + values = np.array([1, 2, 3, 4]) + result = is_in(values, 5, 6, 7) + expected = np.array([False, False, False, False]) + np.testing.assert_array_equal(result, expected)