VectorInstitute · amrit110 · May 24, 2024 · May 16, 2024 · May 23, 2024 · May 23, 2024
diff --git a/cyclops/__init__.py b/cyclops/__init__.py
@@ -1 +1,13 @@
 """Cyclops package."""
+
+import pandas as pd
+
+
+# Opt in to Copy-on-Write copy/view semantics, which will be the default in pandas 3.0.
+# See: https://pandas.pydata.org/docs/user_guide/copy_on_write.html#copy-on-write-enabling
+pd.options.mode.copy_on_write = True
+
+
+# Infer sequences of str objects as the pyarrow string dtype;
+# this will be the default in pandas 3.0.
+pd.options.future.infer_string = True
diff --git a/cyclops/data/df/handle_types.py b/cyclops/data/df/handle_types.py
@@ -473,7 +473,7 @@ def _numeric_categorical_mapping(
     for i, unique_val in enumerate(unique):
         map_dict[unique_val] = i
 
-    series = series.replace(map_dict)
+    series = series.map(map_dict)
 
     inv_map = {v: k for k, v in map_dict.items()}
     meta = {FEATURE_MAPPING_ATTR: inv_map}

diff --git a/cyclops/data/df/vectorized.py b/cyclops/data/df/vectorized.py
@@ -7,6 +7,7 @@
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
+import pandas as pd
 
 from cyclops.data.df.normalize import VectorizedNormalizer
 from cyclops.data.df.split import split_idx
@@ -191,7 +192,11 @@ def __init__(
         if not all(isinstance(name, str) for name in axis_names):
             raise ValueError("Axis names must be strings.")
         for i, index in enumerate(indexes):
-            if not isinstance(index, list) and not isinstance(index, np.ndarray):
+            if (
+                not isinstance(index, list)
+                and not pd.api.types.is_string_dtype(index)
+                and not isinstance(index, np.ndarray)
+            ):
                 raise ValueError("Indexes must be a list of list or numpy.ndarray.")
 
             index_ = np.array(index)

diff --git a/docs/source/tutorials/mimiciv/mortality_prediction.ipynb b/docs/source/tutorials/mimiciv/mortality_prediction.ipynb
@@ -196,8 +196,13 @@
     "    ).dt.days <= N\n",
     "    print(f\"Encounters with death timestamp within {N} days: {valid.sum()}\")\n",
     "    # (Died in hospital) & (Death timestamp is defined)\n",
-    "    patient_encounters[\"mortality_outcome\"] = 0\n",
-    "    patient_encounters[\"mortality_outcome\"][valid] = 1\n",
+    "    print(len(patient_encounters))\n",
+    "    patient_encounters[\"mortality_outcome\"] = pd.Series(\n",
+    "        [0] * len(patient_encounters),\n",
+    "        index=patient_encounters.index,\n",
+    "        dtype=\"int64[pyarrow]\",\n",
+    "    )\n",
+    "    patient_encounters.loc[valid, \"mortality_outcome\"] = 1\n",
     "    print(\n",
     "        f\"Encounters with mortality outcome for the model: {patient_encounters['mortality_outcome'].sum()}\",\n",
     "    )\n",
@@ -367,7 +372,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "cohort[\"mortality_outcome\"] = cohort[\"mortality_outcome\"].astype(\"int\")\n",
     "fig = px.pie(cohort, names=\"mortality_outcome\")\n",
     "fig.update_traces(textinfo=\"percent+label\")\n",
     "fig.update_layout(title_text=\"Outcome Distribution\")\n",
@@ -1299,7 +1303,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -1313,7 +1317,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,7 +13,7 @@ readme = "README.md"
 
 [tool.poetry.dependencies]
 python = ">=3.9, <3.12"
-pandas = "^2.0"
+pandas = {version = "^2.1", extras = ["performance"]}
 numpy = "^1.24.0"
 scikit-learn = "^1.4.0"
 scipy = "^1.11.0"

diff --git a/tests/cyclops/data/df/test_handle_types.py b/tests/cyclops/data/df/test_handle_types.py
@@ -41,9 +41,11 @@ def test_to_dtype():
     series_two = pd.Series([True, False, True])
     assert pd.api.types.is_bool_dtype(to_dtype(series_two, BINARY))
     series_three = pd.Series([0, 1, 3])
-    assert pd.api.types.is_categorical_dtype(to_dtype(series_three, ORDINAL))
+    assert isinstance(
+        to_dtype(series_three, ORDINAL).dtype, pd.api.types.CategoricalDtype
+    )
     series_four = pd.Series(["a", "B", "C"])
-    assert pd.api.types.is_object_dtype(to_dtype(series_four, STRING))
+    assert pd.api.types.is_string_dtype(to_dtype(series_four, STRING))
 
 
 def test_collect_indicators():