replace .A with .toarray() (#361)

* replace `A` with `toarray`
* changelog
* also fix formulaic error for nightly
* use pep 517
* add pandas.DataFrame to the inputs
* trying out a fix for windows
* revert conda-build changes
Quantco · May 24, 2024 · b246585 · b246585
1 parent 4ffbdd3
commit b246585
Show file tree

Hide file tree

Showing 13 changed files with 107 additions and 85 deletions.
diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml
@@ -37,6 +37,8 @@ jobs:
             micromamba remove -y --force $pkg
             pip install --pre --no-deps --only-binary :all: --upgrade --timeout=60 -i $PRE_WHEELS $pkg
           done
+          micromamba remove -y --force formulaic
+          pip install --no-deps git+https://github.com/matthewwardrop/formulaic
           micromamba list
       - name: Install repository
         shell: bash -el {0}

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,6 +7,14 @@
 Changelog
 =========
 
+Unreleased
+----------
+
+**Other changes:**
+
+- Removed reference to the ``.A`` attribute and replaced it with ``.toarray()``.
+- Added support for using formulaic with pandas 3.0.
+
 4.0.0 - 2024-04-23
 ------------------
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,8 +9,11 @@ requires = [
 ]
 
 [tool.ruff]
-ignore = ["E731", "N802", "N803", "N806"]
 line-length = 88
+target-version = "py39"
+
+[tool.ruff.lint]
+ignore = ["E731", "N802", "N803", "N806"]
 select = [
   # pyflakes
   "F",
@@ -23,9 +26,8 @@ select = [
   # pyupgrade
   "UP",
 ]
-target-version = "py39"
 
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
 known-first-party = ["tabmat"]
 
 [tool.mypy]

diff --git a/src/tabmat/categorical_matrix.py b/src/tabmat/categorical_matrix.py
@@ -586,7 +586,7 @@ def to_sparse_matrix(self):
 
     def toarray(self) -> np.ndarray:
         """Return array representation of matrix."""
-        return self.tocsr().A
+        return self.tocsr().toarray()
 
     def unpack(self):
         """Return the underlying pandas.Categorical."""
@@ -703,7 +703,7 @@ def _cross_sparse(
 
         term_1 = _row_col_indexing(term_1, rows, L_cols)
 
-        res = term_1.T.dot(_row_col_indexing(other, rows, R_cols)).A
+        res = term_1.T.dot(_row_col_indexing(other, rows, R_cols)).toarray()
         return res
 
     def multiply(self, other) -> SparseMatrix:

diff --git a/src/tabmat/formula.py b/src/tabmat/formula.py
@@ -29,7 +29,7 @@ class TabmatMaterializer(FormulaMaterializer):
     """Materializer for pandas input and tabmat output."""
 
     REGISTER_NAME = "tabmat"
-    REGISTER_INPUTS = ("pandas.core.frame.DataFrame",)
+    REGISTER_INPUTS = ("pandas.core.frame.DataFrame", "pandas.DataFrame")
     REGISTER_OUTPUTS = "tabmat"
 
     @override

diff --git a/src/tabmat/split_matrix.py b/src/tabmat/split_matrix.py
@@ -306,7 +306,7 @@ def toarray(self) -> np.ndarray:
         """Return array representation of matrix."""
         out = np.empty(self.shape)
         for mat, idx in zip(self.matrices, self.indices):
-            out[:, idx] = mat.A
+            out[:, idx] = mat.toarray()
         return out
 
     def getcol(self, i: int) -> Union[np.ndarray, sps.csr_matrix]:

diff --git a/src/tabmat/standardized_mat.py b/src/tabmat/standardized_mat.py
@@ -105,7 +105,7 @@ def getcol(self, i: int):
         >>> col_1 = x.getcol(1)
         >>> isinstance(col_1, StandardizedMatrix)
         True
-        >>> col_1.A
+        >>> col_1.toarray()
         array([[1.],
                [2.],
                [1.]])
@@ -254,7 +254,7 @@ def multiply(self, other) -> DenseMatrix:
 
     def toarray(self) -> np.ndarray:
         """Return array representation of matrix."""
-        mat_part = self.mat.A
+        mat_part = self.mat.toarray()
         if self.mult is not None:
             mat_part = self.mult[None, :] * mat_part
         return mat_part + self.shift[None, :]
@@ -285,7 +285,7 @@ def __getitem__(self, item):
             mult_part = np.atleast_1d(mult_part[col])
 
         if isinstance(row, int):
-            out = mat_part.A
+            out = mat_part.toarray()
             if mult_part is not None:
                 out = out * mult_part
             return out + shift_part

diff --git a/tests/test_categorical_matrix.py b/tests/test_categorical_matrix.py
@@ -65,7 +65,7 @@ def test_csr_matvec_categorical(
     )
     vec = np.random.choice(np.arange(4, dtype=vec_dtype), mat.shape[1])
     res = cat_mat.matvec(vec)
-    np.testing.assert_allclose(res, cat_mat.A.dot(vec))
+    np.testing.assert_allclose(res, cat_mat.toarray().dot(vec))
 
 
 @pytest.mark.parametrize("drop_first", [True, False], ids=["drop_first", "no_drop"])
@@ -84,7 +84,7 @@ def test_tocsr(cat_vec, drop_first, missing, cat_missing_method):
     cat_mat = CategoricalMatrix(
         cat_vec, drop_first=drop_first, cat_missing_method=cat_missing_method
     )
-    res = cat_mat.tocsr().A
+    res = cat_mat.tocsr().toarray()
     expected = pd.get_dummies(
         cat_vec,
         drop_first=drop_first,
@@ -148,7 +148,7 @@ def test_multiply(cat_vec, drop_first, missing, cat_missing_method):
         )
         * other
     )
-    np.testing.assert_allclose(actual.A, expected)
+    np.testing.assert_allclose(actual.toarray(), expected)
 
 
 @pytest.mark.parametrize("mi_element", [np.nan, None])
@@ -202,4 +202,4 @@ def test_categorical_indexing(drop_first, missing, cat_missing_method):
         drop_first=drop_first,
         dummy_na=cat_missing_method == "convert" and missing,
     ).to_numpy()[:, [0, 1]]
-    np.testing.assert_allclose(mat[:, [0, 1]].A, expected)
+    np.testing.assert_allclose(mat[:, [0, 1]].toarray(), expected)
diff --git a/tests/test_formula.py b/tests/test_formula.py
@@ -159,7 +159,7 @@ def test_matrix_against_expectation(df, formula, expected):
     for res, exp in zip(model_df.matrices, expected.matrices):
         assert type(res) == type(exp)
         if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)):
-            np.testing.assert_array_equal(res.A, res.A)
+            np.testing.assert_array_equal(res.toarray(), res.toarray())
         elif isinstance(res, tm.CategoricalMatrix):
             assert (exp.cat == res.cat).all()
             assert exp.drop_first == res.drop_first
@@ -268,7 +268,7 @@ def test_matrix_against_expectation_qcl(df, formula, expected):
     for res, exp in zip(model_df.matrices, expected.matrices):
         assert type(res) == type(exp)
         if isinstance(res, (tm.DenseMatrix, tm.SparseMatrix)):
-            np.testing.assert_array_equal(res.A, res.A)
+            np.testing.assert_array_equal(res.toarray(), res.toarray())
         elif isinstance(res, tm.CategoricalMatrix):
             assert (exp.cat == res.cat).all()
             assert exp.drop_first == res.drop_first
@@ -311,7 +311,7 @@ def test_matrix_against_pandas(df, formula, ensure_full_rank):
         include_intercept=True,
         context=0,
     )
-    np.testing.assert_array_equal(model_df.to_numpy(), model_tabmat.A)
+    np.testing.assert_array_equal(model_df.to_numpy(), model_tabmat.toarray())
 
 
 @pytest.mark.parametrize(
@@ -524,7 +524,9 @@ def test_include_intercept(
         include_intercept=True,
         ensure_full_rank=ensure_full_rank,
     )
-    np.testing.assert_array_equal(model_no_include.A, model_no_intercept.A)
+    np.testing.assert_array_equal(
+        model_no_include.toarray(), model_no_intercept.toarray()
+    )
     assert (
         model_no_include.model_spec.column_names
         == model_no_intercept.model_spec.column_names
@@ -539,7 +541,7 @@ def test_include_intercept(
         include_intercept=False,
         ensure_full_rank=ensure_full_rank,
     )
-    np.testing.assert_array_equal(model_include.A, model_intercept.A)
+    np.testing.assert_array_equal(model_include.toarray(), model_intercept.toarray())
     assert (
         model_no_include.model_spec.column_names
         == model_no_intercept.model_spec.column_names
@@ -561,7 +563,9 @@ def test_C_state(df, formula, ensure_full_rank):
         "str_1 : cat_1 + 1", df, cat_threshold=0, ensure_full_rank=ensure_full_rank
     )
     model_tabmat_2 = model_tabmat.model_spec.get_model_matrix(df[:2])
-    np.testing.assert_array_equal(model_tabmat.A[:2, :], model_tabmat_2.A)
+    np.testing.assert_array_equal(
+        model_tabmat.toarray()[:2, :], model_tabmat_2.toarray()
+    )
     np.testing.assert_array_equal(
         model_tabmat.matrices[1].cat.categories,
         model_tabmat_2.matrices[1].cat.categories,
@@ -593,8 +597,8 @@ def test_C_state(df, formula, ensure_full_rank):
 @pytest.mark.parametrize("reverse", [False, True], ids=["not_reversed", "reversed"])
 def test_interactable_vectors(left, right, reverse):
     n = left.to_tabmat().shape[0]
-    left_np = left.to_tabmat().A.reshape((n, -1))
-    right_np = right.to_tabmat().A.reshape((n, -1))
+    left_np = left.to_tabmat().toarray().reshape((n, -1))
+    right_np = right.to_tabmat().toarray().reshape((n, -1))
 
     if reverse:
         left_np, right_np = right_np, left_np
@@ -625,7 +629,7 @@ def test_interactable_vectors(left, right, reverse):
 
     # Test values
     np.testing.assert_array_equal(
-        result_vec.to_tabmat().A.squeeze(), result_np.squeeze()
+        result_vec.to_tabmat().toarray().squeeze(), result_np.squeeze()
     )
 
     # Test names
@@ -664,12 +668,14 @@ def test_cat_missing_handling(cat_missing_method, cat_missing_name):
 
     assert mat_from_pandas.column_names == mat_from_formula.column_names
     assert mat_from_pandas.term_names == mat_from_formula.term_names
-    np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula.A)
+    np.testing.assert_array_equal(mat_from_pandas.toarray(), mat_from_formula.toarray())
 
     mat_from_formula_new = mat_from_formula.model_spec.get_model_matrix(df)
     assert mat_from_pandas.column_names == mat_from_formula_new.column_names
     assert mat_from_pandas.term_names == mat_from_formula_new.term_names
-    np.testing.assert_array_equal(mat_from_pandas.A, mat_from_formula_new.A)
+    np.testing.assert_array_equal(
+        mat_from_pandas.toarray(), mat_from_formula_new.toarray()
+    )
 
 
 def test_cat_missing_C():
@@ -695,9 +701,11 @@ def test_cat_missing_C():
 
     assert result.column_names == expected_names
     assert result.model_spec.get_model_matrix(df).column_names == expected_names
-    np.testing.assert_equal(result.model_spec.get_model_matrix(df).A, result.A)
     np.testing.assert_equal(
-        result.model_spec.get_model_matrix(df[:2]).A, result.A[:2, :]
+        result.model_spec.get_model_matrix(df).toarray(), result.toarray()
+    )
+    np.testing.assert_equal(
+        result.model_spec.get_model_matrix(df[:2]).toarray(), result.toarray()[:2, :]
     )
 
 
@@ -726,7 +734,7 @@ def test_cat_missing_unseen(cat_missing_method):
     elif cat_missing_method == "zero":
         expected_array = np.array([[1, 0], [0, 0]], dtype=np.float64)
 
-    np.testing.assert_array_equal(result_unseen.A, expected_array)
+    np.testing.assert_array_equal(result_unseen.toarray(), expected_array)
 
 
 def test_cat_missing_interactions():
@@ -799,9 +807,9 @@ def test_unseen_missing(cat_missing_method):
             result_seen.model_spec.get_model_matrix(df_unseen)
     elif cat_missing_method == "zero":
         result_unseen = result_seen.model_spec.get_model_matrix(df_unseen)
-        assert result_unseen.A.shape == (3, 2)
+        assert result_unseen.toarray().shape == (3, 2)
         np.testing.assert_array_equal(
-            result_unseen.A, np.array([[1, 0], [0, 1], [0, 0]])
+            result_unseen.toarray(), np.array([[1, 0], [0, 1], [0, 0]])
         )
         assert result_unseen.column_names == ["cat_1[a]", "cat_1[b]"]