Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

replace .A with .toarray() #361

Merged
merged 8 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/daily.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ jobs:
micromamba remove -y --force $pkg
pip install --pre --no-deps --only-binary :all: --upgrade --timeout=60 -i $PRE_WHEELS $pkg
done
micromamba remove -y --force formulaic
pip install --no-deps git+https://github.com/matthewwardrop/formulaic
micromamba list
- name: Install repository
shell: bash -el {0}
Expand Down
8 changes: 8 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@
Changelog
=========

Unreleased
----------

**Other changes:**

- Removed reference to the ``.A`` attribute and replaced it with ``.toarray()``.
- Added support for using formulaic together with pandas 3.0.

4.0.0 - 2024-04-23
------------------

Expand Down
8 changes: 5 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@ requires = [
]

[tool.ruff]
ignore = ["E731", "N802", "N803", "N806"]
line-length = 88
target-version = "py39"

[tool.ruff.lint]
ignore = ["E731", "N802", "N803", "N806"]
select = [
# pyflakes
"F",
Expand All @@ -23,9 +26,8 @@ select = [
# pyupgrade
"UP",
]
target-version = "py39"

[tool.ruff.isort]
[tool.ruff.lint.isort]
known-first-party = ["tabmat"]

[tool.mypy]
Expand Down
4 changes: 2 additions & 2 deletions src/tabmat/categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,7 @@ def to_sparse_matrix(self):

def toarray(self) -> np.ndarray:
"""Return array representation of matrix."""
return self.tocsr().A
return self.tocsr().toarray()

def unpack(self):
"""Return the underlying pandas.Categorical."""
Expand Down Expand Up @@ -703,7 +703,7 @@ def _cross_sparse(

term_1 = _row_col_indexing(term_1, rows, L_cols)

res = term_1.T.dot(_row_col_indexing(other, rows, R_cols)).A
res = term_1.T.dot(_row_col_indexing(other, rows, R_cols)).toarray()
return res

def multiply(self, other) -> SparseMatrix:
Expand Down
2 changes: 1 addition & 1 deletion src/tabmat/formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class TabmatMaterializer(FormulaMaterializer):
"""Materializer for pandas input and tabmat output."""

REGISTER_NAME = "tabmat"
REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
REGISTER_INPUTS = ("pandas.core.frame.DataFrame", "pandas.DataFrame")
REGISTER_OUTPUTS = "tabmat"

@override
Expand Down
2 changes: 1 addition & 1 deletion src/tabmat/split_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def toarray(self) -> np.ndarray:
"""Return array representation of matrix."""
out = np.empty(self.shape)
for mat, idx in zip(self.matrices, self.indices):
out[:, idx] = mat.A
out[:, idx] = mat.toarray()
return out

def getcol(self, i: int) -> Union[np.ndarray, sps.csr_matrix]:
Expand Down
6 changes: 3 additions & 3 deletions src/tabmat/standardized_mat.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def getcol(self, i: int):
>>> col_1 = x.getcol(1)
>>> isinstance(col_1, StandardizedMatrix)
True
>>> col_1.A
>>> col_1.toarray()
array([[1.],
[2.],
[1.]])
Expand Down Expand Up @@ -254,7 +254,7 @@ def multiply(self, other) -> DenseMatrix:

def toarray(self) -> np.ndarray:
"""Return array representation of matrix."""
mat_part = self.mat.A
mat_part = self.mat.toarray()
if self.mult is not None:
mat_part = self.mult[None, :] * mat_part
return mat_part + self.shift[None, :]
Expand Down Expand Up @@ -285,7 +285,7 @@ def __getitem__(self, item):
mult_part = np.atleast_1d(mult_part[col])

if isinstance(row, int):
out = mat_part.A
out = mat_part.toarray()
if mult_part is not None:
out = out * mult_part
return out + shift_part
Expand Down
8 changes: 4 additions & 4 deletions tests/test_categorical_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_csr_matvec_categorical(
)
vec = np.random.choice(np.arange(4, dtype=vec_dtype), mat.shape[1])
res = cat_mat.matvec(vec)
np.testing.assert_allclose(res, cat_mat.A.dot(vec))
np.testing.assert_allclose(res, cat_mat.toarray().dot(vec))


@pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"])
Expand All @@ -84,7 +84,7 @@ def test_tocsr(cat_vec, drop_first, missing, cat_missing_method):
cat_mat = CategoricalMatrix(
cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method
)
res = cat_mat.tocsr().A
res = cat_mat.tocsr().toarray()
expected = pd.get_dummies(
cat_vec,
drop_first=drop_first,
Expand Down Expand Up @@ -148,7 +148,7 @@ def test_multiply(cat_vec, drop_first, missing, cat_missing_method):
)
* other
)
np.testing.assert_allclose(actual.A, expected)
np.testing.assert_allclose(actual.toarray(), expected)


@pytest.mark.parametrize("mi_element", [np.nan, None])
Expand Down Expand Up @@ -202,4 +202,4 @@ def test_categorical_indexing(drop_first, missing, cat_missing_method):
drop_first=drop_first,
dummy_na=cat_missing_method == "convert" and missing,
).to_numpy()[:, [0, 1]]
np.testing.assert_allclose(mat[:, [0, 1]].A, expected)
np.testing.assert_allclose(mat[:, [0, 1]].toarray(), expected)
40 changes: 24 additions & 16 deletions tests/test_formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def test_matrix_against_expectation(df, formula, expected):
for res, exp in zip(model_df.matrices, expected.matrices):
assert type(res) == type(exp)
if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)):
np.testing.assert_array_equal(res.A, res.A)
np.testing.assert_array_equal(res.toarray(), res.toarray())
elif isinstance(res, tm.CategoricalMatrix):
assert (exp.cat == res.cat).all()
assert exp.drop_first == res.drop_first
Expand Down Expand Up @@ -268,7 +268,7 @@ def test_matrix_against_expectation_qcl(df, formula, expected):
for res, exp in zip(model_df.matrices, expected.matrices):
assert type(res) == type(exp)
if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)):
np.testing.assert_array_equal(res.A, res.A)
np.testing.assert_array_equal(res.toarray(), res.toarray())
elif isinstance(res, tm.CategoricalMatrix):
assert (exp.cat == res.cat).all()
assert exp.drop_first == res.drop_first
Expand Down Expand Up @@ -311,7 +311,7 @@ def test_matrix_against_pandas(df, formula, ensure_full_rank):
include_intercept=True,
context=0,
)
np.testing.assert_array_equal(model_df.to_numpy(), model_tabmat.A)
np.testing.assert_array_equal(model_df.to_numpy(), model_tabmat.toarray())


@pytest.mark.parametrize(
Expand Down Expand Up @@ -524,7 +524,9 @@ def test_include_intercept(
include_intercept=True,
ensure_full_rank=ensure_full_rank,
)
np.testing.assert_array_equal(model_no_include.A, model_no_intercept.A)
np.testing.assert_array_equal(
model_no_include.toarray(), model_no_intercept.toarray()
)
assert (
model_no_include.model_spec.column_names
== model_no_intercept.model_spec.column_names
Expand All @@ -539,7 +541,7 @@ def test_include_intercept(
include_intercept=False,
ensure_full_rank=ensure_full_rank,
)
np.testing.assert_array_equal(model_include.A, model_intercept.A)
np.testing.assert_array_equal(model_include.toarray(), model_intercept.toarray())
assert (
model_no_include.model_spec.column_names
== model_no_intercept.model_spec.column_names
Expand All @@ -561,7 +563,9 @@ def test_C_state(df, formula, ensure_full_rank):
"str_1 : cat_1 + 1", df, cat_threshold=0, ensure_full_rank=ensure_full_rank
)
model_tabmat_2 = model_tabmat.model_spec.get_model_matrix(df[:2])
np.testing.assert_array_equal(model_tabmat.A[:2, :], model_tabmat_2.A)
np.testing.assert_array_equal(
model_tabmat.toarray()[:2, :], model_tabmat_2.toarray()
)
np.testing.assert_array_equal(
model_tabmat.matrices[1].cat.categories,
model_tabmat_2.matrices[1].cat.categories,
Expand Down Expand Up @@ -593,8 +597,8 @@ def test_C_state(df, formula, ensure_full_rank):
@pytest.mark.parametrize("reverse", [False, True], ids=["not_reversed", "reversed"])
def test_interactable_vectors(left, right, reverse):
n = left.to_tabmat().shape[0]
left_np = left.to_tabmat().A.reshape((n, -1))
right_np = right.to_tabmat().A.reshape((n, -1))
left_np = left.to_tabmat().toarray().reshape((n, -1))
right_np = right.to_tabmat().toarray().reshape((n, -1))

if reverse:
left_np, right_np = right_np, left_np
Expand Down Expand Up @@ -625,7 +629,7 @@ def test_interactable_vectors(left, right, reverse):

# Test values
np.testing.assert_array_equal(
result_vec.to_tabmat().A.squeeze(), result_np.squeeze()
result_vec.to_tabmat().toarray().squeeze(), result_np.squeeze()
)

# Test names
Expand Down Expand Up @@ -664,12 +668,14 @@ def test_cat_missing_handling(cat_missing_method, cat_missing_name):

assert mat_from_pandas.column_names == mat_from_formula.column_names
assert mat_from_pandas.term_names == mat_from_formula.term_names
np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula.A)
np.testing.assert_array_equal(mat_from_pandas.toarray(), mat_from_formula.toarray())

mat_from_formula_new = mat_from_formula.model_spec.get_model_matrix(df)
assert mat_from_pandas.column_names == mat_from_formula_new.column_names
assert mat_from_pandas.term_names == mat_from_formula_new.term_names
np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula_new.A)
np.testing.assert_array_equal(
mat_from_pandas.toarray(), mat_from_formula_new.toarray()
)


def test_cat_missing_C():
Expand All @@ -695,9 +701,11 @@ def test_cat_missing_C():

assert result.column_names == expected_names
assert result.model_spec.get_model_matrix(df).column_names == expected_names
np.testing.assert_equal(result.model_spec.get_model_matrix(df).A, result.A)
np.testing.assert_equal(
result.model_spec.get_model_matrix(df[:2]).A, result.A[:2, :]
result.model_spec.get_model_matrix(df).toarray(), result.toarray()
)
np.testing.assert_equal(
result.model_spec.get_model_matrix(df[:2]).toarray(), result.toarray()[:2, :]
)


Expand Down Expand Up @@ -726,7 +734,7 @@ def test_cat_missing_unseen(cat_missing_method):
elif cat_missing_method == "zero":
expected_array = np.array([[1, 0], [0, 0]], dtype=np.float64)

np.testing.assert_array_equal(result_unseen.A, expected_array)
np.testing.assert_array_equal(result_unseen.toarray(), expected_array)


def test_cat_missing_interactions():
Expand Down Expand Up @@ -799,9 +807,9 @@ def test_unseen_missing(cat_missing_method):
result_seen.model_spec.get_model_matrix(df_unseen)
elif cat_missing_method == "zero":
result_unseen = result_seen.model_spec.get_model_matrix(df_unseen)
assert result_unseen.A.shape == (3, 2)
assert result_unseen.toarray().shape == (3, 2)
np.testing.assert_array_equal(
result_unseen.A, np.array([[1, 0], [0, 1], [0, 0]])
result_unseen.toarray(), np.array([[1, 0], [0, 1], [0, 0]])
)
assert result_unseen.column_names == ["cat_1[a]", "cat_1[b]"]

Expand Down
Loading
Loading