Skip to content

Commit

Permalink
replace .A with .toarray() (#361)
Browse files Browse the repository at this point in the history
* replace `A` with `toarray`

* changelog

* also fix formulaic error for nightly

* use pep 517

* add pandas.DataFrame to the inputs

* trying out a fix for windows

* revert conda-build changes
  • Loading branch information
MarcAntoineSchmidtQC committed May 24, 2024
1 parent 4ffbdd3 commit b246585
Show file tree
Hide file tree
Showing 13 changed files with 107 additions and 85 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/daily.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ jobs:
micromamba remove -y --force $pkg
pip install --pre --no-deps --only-binary :all: --upgrade --timeout=60 -i $PRE_WHEELS $pkg
done
micromamba remove -y --force formulaic
pip install --no-deps git+https://github.com/matthewwardrop/formulaic
micromamba list
- name: Install repository
shell: bash -el {0}
Expand Down
8 changes: 8 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@
Changelog
=========

Unreleased
----------

**Other changes:**

- Removed references to the ``.A`` attribute and replaced them with ``.toarray()``.
- Added support for using formulaic together with pandas 3.0.

4.0.0 - 2024-04-23
------------------

Expand Down
8 changes: 5 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@ requires = [
]

[tool.ruff]
ignore = ["E731", "N802", "N803", "N806"]
line-length = 88
target-version = "py39"

[tool.ruff.lint]
ignore = ["E731", "N802", "N803", "N806"]
select = [
# pyflakes
"F",
Expand All @@ -23,9 +26,8 @@ select = [
# pyupgrade
"UP",
]
target-version = "py39"

[tool.ruff.isort]
[tool.ruff.lint.isort]
known-first-party = ["tabmat"]

[tool.mypy]
Expand Down
4 changes: 2 additions & 2 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,7 @@ def to_sparse_matrix(self):

def toarray(self) -> np.ndarray:
"""Return array representation of matrix."""
return self.tocsr().A
return self.tocsr().toarray()

def unpack(self):
"""Return the underlying pandas.Categorical."""
Expand Down Expand Up @@ -703,7 +703,7 @@ def _cross_sparse(

term_1 = _row_col_indexing(term_1, rows, L_cols)

res = term_1.T.dot(_row_col_indexing(other, rows, R_cols)).A
res = term_1.T.dot(_row_col_indexing(other, rows, R_cols)).toarray()
return res

def multiply(self, other) -> SparseMatrix:
Expand Down
2 changes: 1 addition & 1 deletion src/tabmat/formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class TabmatMaterializer(FormulaMaterializer):
"""Materializer for pandas input and tabmat output."""

REGISTER_NAME = "tabmat"
REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
REGISTER_INPUTS = ("pandas.core.frame.DataFrame", "pandas.DataFrame")
REGISTER_OUTPUTS = "tabmat"

@override
Expand Down
2 changes: 1 addition & 1 deletion src/tabmat/split_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def toarray(self) -> np.ndarray:
"""Return array representation of matrix."""
out = np.empty(self.shape)
for mat, idx in zip(self.matrices, self.indices):
out[:, idx] = mat.A
out[:, idx] = mat.toarray()
return out

def getcol(self, i: int) -> Union[np.ndarray, sps.csr_matrix]:
Expand Down
6 changes: 3 additions & 3 deletions src/tabmat/standardized_mat.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def getcol(self, i: int):
>>> col_1 = x.getcol(1)
>>> isinstance(col_1, StandardizedMatrix)
True
>>> col_1.A
>>> col_1.toarray()
array([[1.],
[2.],
[1.]])
Expand Down Expand Up @@ -254,7 +254,7 @@ def multiply(self, other) -> DenseMatrix:

def toarray(self) -> np.ndarray:
"""Return array representation of matrix."""
mat_part = self.mat.A
mat_part = self.mat.toarray()
if self.mult is not None:
mat_part = self.mult[None, :] * mat_part
return mat_part + self.shift[None, :]
Expand Down Expand Up @@ -285,7 +285,7 @@ def __getitem__(self, item):
mult_part = np.atleast_1d(mult_part[col])

if isinstance(row, int):
out = mat_part.A
out = mat_part.toarray()
if mult_part is not None:
out = out * mult_part
return out + shift_part
Expand Down
8 changes: 4 additions & 4 deletions tests/test_categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_csr_matvec_categorical(
)
vec = np.random.choice(np.arange(4, dtype=vec_dtype), mat.shape[1])
res = cat_mat.matvec(vec)
np.testing.assert_allclose(res, cat_mat.A.dot(vec))
np.testing.assert_allclose(res, cat_mat.toarray().dot(vec))


@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"])
Expand All @@ -84,7 +84,7 @@ def test_tocsr(cat_vec, drop_first, missing, cat_missing_method):
cat_mat = CategoricalMatrix(
cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method
)
res = cat_mat.tocsr().A
res = cat_mat.tocsr().toarray()
expected = pd.get_dummies(
cat_vec,
drop_first=drop_first,
Expand Down Expand Up @@ -148,7 +148,7 @@ def test_multiply(cat_vec, drop_first, missing, cat_missing_method):
)
* other
)
np.testing.assert_allclose(actual.A, expected)
np.testing.assert_allclose(actual.toarray(), expected)


@pytest.mark.parametrize("mi_element", [np.nan, None])
Expand Down Expand Up @@ -202,4 +202,4 @@ def test_categorical_indexing(drop_first, missing, cat_missing_method):
drop_first=drop_first,
dummy_na=cat_missing_method == "convert" and missing,
).to_numpy()[:, [0, 1]]
np.testing.assert_allclose(mat[:, [0, 1]].A, expected)
np.testing.assert_allclose(mat[:, [0, 1]].toarray(), expected)
40 changes: 24 additions & 16 deletions tests/test_formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def test_matrix_against_expectation(df, formula, expected):
for res, exp in zip(model_df.matrices, expected.matrices):
assert type(res) == type(exp)
if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)):
np.testing.assert_array_equal(res.A, res.A)
np.testing.assert_array_equal(res.toarray(), res.toarray())
elif isinstance(res, tm.CategoricalMatrix):
assert (exp.cat == res.cat).all()
assert exp.drop_first == res.drop_first
Expand Down Expand Up @@ -268,7 +268,7 @@ def test_matrix_against_expectation_qcl(df, formula, expected):
for res, exp in zip(model_df.matrices, expected.matrices):
assert type(res) == type(exp)
if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)):
np.testing.assert_array_equal(res.A, res.A)
np.testing.assert_array_equal(res.toarray(), res.toarray())
elif isinstance(res, tm.CategoricalMatrix):
assert (exp.cat == res.cat).all()
assert exp.drop_first == res.drop_first
Expand Down Expand Up @@ -311,7 +311,7 @@ def test_matrix_against_pandas(df, formula, ensure_full_rank):
include_intercept=True,
context=0,
)
np.testing.assert_array_equal(model_df.to_numpy(), model_tabmat.A)
np.testing.assert_array_equal(model_df.to_numpy(), model_tabmat.toarray())


@pytest.mark.parametrize(
Expand Down Expand Up @@ -524,7 +524,9 @@ def test_include_intercept(
include_intercept=True,
ensure_full_rank=ensure_full_rank,
)
np.testing.assert_array_equal(model_no_include.A, model_no_intercept.A)
np.testing.assert_array_equal(
model_no_include.toarray(), model_no_intercept.toarray()
)
assert (
model_no_include.model_spec.column_names
== model_no_intercept.model_spec.column_names
Expand All @@ -539,7 +541,7 @@ def test_include_intercept(
include_intercept=False,
ensure_full_rank=ensure_full_rank,
)
np.testing.assert_array_equal(model_include.A, model_intercept.A)
np.testing.assert_array_equal(model_include.toarray(), model_intercept.toarray())
assert (
model_no_include.model_spec.column_names
== model_no_intercept.model_spec.column_names
Expand All @@ -561,7 +563,9 @@ def test_C_state(df, formula, ensure_full_rank):
"str_1 : cat_1 + 1", df, cat_threshold=0, ensure_full_rank=ensure_full_rank
)
model_tabmat_2 = model_tabmat.model_spec.get_model_matrix(df[:2])
np.testing.assert_array_equal(model_tabmat.A[:2, :], model_tabmat_2.A)
np.testing.assert_array_equal(
model_tabmat.toarray()[:2, :], model_tabmat_2.toarray()
)
np.testing.assert_array_equal(
model_tabmat.matrices[1].cat.categories,
model_tabmat_2.matrices[1].cat.categories,
Expand Down Expand Up @@ -593,8 +597,8 @@ def test_C_state(df, formula, ensure_full_rank):
@pytest.mark.parametrize("reverse", [False, True], ids=["not_reversed", "reversed"])
def test_interactable_vectors(left, right, reverse):
n = left.to_tabmat().shape[0]
left_np = left.to_tabmat().A.reshape((n, -1))
right_np = right.to_tabmat().A.reshape((n, -1))
left_np = left.to_tabmat().toarray().reshape((n, -1))
right_np = right.to_tabmat().toarray().reshape((n, -1))

if reverse:
left_np, right_np = right_np, left_np
Expand Down Expand Up @@ -625,7 +629,7 @@ def test_interactable_vectors(left, right, reverse):

# Test values
np.testing.assert_array_equal(
result_vec.to_tabmat().A.squeeze(), result_np.squeeze()
result_vec.to_tabmat().toarray().squeeze(), result_np.squeeze()
)

# Test names
Expand Down Expand Up @@ -664,12 +668,14 @@ def test_cat_missing_handling(cat_missing_method, cat_missing_name):

assert mat_from_pandas.column_names == mat_from_formula.column_names
assert mat_from_pandas.term_names == mat_from_formula.term_names
np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula.A)
np.testing.assert_array_equal(mat_from_pandas.toarray(), mat_from_formula.toarray())

mat_from_formula_new = mat_from_formula.model_spec.get_model_matrix(df)
assert mat_from_pandas.column_names == mat_from_formula_new.column_names
assert mat_from_pandas.term_names == mat_from_formula_new.term_names
np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula_new.A)
np.testing.assert_array_equal(
mat_from_pandas.toarray(), mat_from_formula_new.toarray()
)


def test_cat_missing_C():
Expand All @@ -695,9 +701,11 @@ def test_cat_missing_C():

assert result.column_names == expected_names
assert result.model_spec.get_model_matrix(df).column_names == expected_names
np.testing.assert_equal(result.model_spec.get_model_matrix(df).A, result.A)
np.testing.assert_equal(
result.model_spec.get_model_matrix(df[:2]).A, result.A[:2, :]
result.model_spec.get_model_matrix(df).toarray(), result.toarray()
)
np.testing.assert_equal(
result.model_spec.get_model_matrix(df[:2]).toarray(), result.toarray()[:2, :]
)


Expand Down Expand Up @@ -726,7 +734,7 @@ def test_cat_missing_unseen(cat_missing_method):
elif cat_missing_method == "zero":
expected_array = np.array([[1, 0], [0, 0]], dtype=np.float64)

np.testing.assert_array_equal(result_unseen.A, expected_array)
np.testing.assert_array_equal(result_unseen.toarray(), expected_array)


def test_cat_missing_interactions():
Expand Down Expand Up @@ -799,9 +807,9 @@ def test_unseen_missing(cat_missing_method):
result_seen.model_spec.get_model_matrix(df_unseen)
elif cat_missing_method == "zero":
result_unseen = result_seen.model_spec.get_model_matrix(df_unseen)
assert result_unseen.A.shape == (3, 2)
assert result_unseen.toarray().shape == (3, 2)
np.testing.assert_array_equal(
result_unseen.A, np.array([[1, 0], [0, 1], [0, 0]])
result_unseen.toarray(), np.array([[1, 0], [0, 1], [0, 0]])
)
assert result_unseen.column_names == ["cat_1[a]", "cat_1[b]"]

Expand Down
Loading

0 comments on commit b246585

Please sign in to comment.