Commit
Merge pull request #68 from SenteraLLC/DEM-416-refactor-cv
[DEM-416] Add `filepath` to `make_1_to_1_plot()`
tnigon committed Jan 23, 2024
2 parents 0474d1f + 79d7e91 commit 906bf01
Showing 6 changed files with 1,559 additions and 1,433 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -3,7 +3,7 @@ exclude: "(?x)^(\n geoml/feature_data.py|\n geoml/feature_selection.py|\n geoml/tests/|\n conftest.py\n)\n"
repos:
- repo: https://github.com/pycqa/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
@@ -12,7 +12,7 @@ repos:
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 6.1.0
rev: 7.0.0
hooks:
- id: flake8
exclude: (tests|doc)
2 changes: 1 addition & 1 deletion geoml/_version.py
@@ -1,3 +1,3 @@
"""Defines package version. Parsed by setup.py and imported by __init__.py."""

__version__ = "1.0.3"
__version__ = "1.0.4"
56 changes: 32 additions & 24 deletions geoml/feature_selection_new.py
@@ -308,31 +308,39 @@ def maximize_alpha(alpha, n_feats):
else:
return -alpha

# To be sure we get the global minimum, we need to narrow in a bit more.
# Here, we figure out where `n_feats` is achieved along `xline`. Then, we expand
# one index outside of that range in both directions and pass to optimization function.
xline = logspace(log(alpha_lower), log(alpha_upper), num=100, base=np_e)
yline = [maximize_alpha(a, n_feats) for a in xline]
x_inds = [i for i in range(len(yline)) if yline[i] != 0]
first_ind = x_inds[0] if x_inds[0] == 0 else x_inds[0] - 1
last_ind = x_inds[-1] if x_inds[-1] == 99 else x_inds[-1] + 1

new_alpha_lower = xline[first_ind]
new_alpha_upper = xline[last_ind]

result = optimize.minimize_scalar(
lambda a: maximize_alpha(a, n_feats),
bounds=(new_alpha_lower, new_alpha_upper),
method="Bounded",
options={"maxiter": 100},
)

if result["success"] is True:
return result["x"]
else:
raise Exception(
"Optimization step was unsuccessful. This is a rare error that could result from a low sample size when narrowing alpha range in step (3)."
# To be sure we get the global minimum, we need to narrow in a bit more. Here we figure out where `n_feats` is
# achieved along `xline`. Then, we expand one index outside of that range in both directions and pass to
# optimization function.
num_initial = 100
x_inds = []
for n_iter in range(1, 11):
n_iter += 1
xline = logspace(
log(alpha_lower), log(alpha_upper), num=(n_iter * num_initial), base=np_e
)
yline = [maximize_alpha(a, n_feats) for a in xline]
x_inds = [i for i in range(len(yline)) if yline[i] != 0]
try:
first_ind = x_inds[0] if x_inds[0] == 0 else x_inds[0] - 1
last_ind = x_inds[-1] if x_inds[-1] == 99 else x_inds[-1] + 1

new_alpha_lower = xline[first_ind]
new_alpha_upper = xline[last_ind]

result = optimize.minimize_scalar(
lambda a: maximize_alpha(a, n_feats),
bounds=(new_alpha_lower, new_alpha_upper),
method="Bounded",
options={"maxiter": 100},
)
if result["success"] is True:
return result["x"]
except IndexError:
continue

raise Exception(
"Optimization step was unsuccessful. This is a rare error that could result from a low sample size when narrowing alpha range in step (3)."
)


def lasso_feature_selection(
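The new block above swaps the single fixed 100-point alpha grid for a retry loop: the log-spaced grid is made denser on each pass until at least one grid point achieves `n_feats`, and an empty `x_inds` (surfacing as an `IndexError`) triggers the next, denser pass. A minimal, self-contained sketch of that densify-and-retry pattern, using a generic objective in place of the repository's `maximize_alpha()` closure (the helper name and loop bounds below are illustrative, not part of the commit):

import numpy as np
from scipy import optimize


def densify_and_retry(objective, lower, upper, num_initial=100, max_passes=10):
    """Densify a log-spaced grid until `objective` is nonzero somewhere,
    then run a bounded scalar minimization over the bracketing range."""
    for n_pass in range(1, max_passes + 1):
        xline = np.logspace(
            np.log(lower), np.log(upper), num=n_pass * num_initial, base=np.e
        )
        yline = [objective(x) for x in xline]
        hits = [i for i, y in enumerate(yline) if y != 0]
        try:
            # Expand one index beyond the nonzero range on both sides.
            first = hits[0] if hits[0] == 0 else hits[0] - 1
            last = hits[-1] if hits[-1] == len(xline) - 1 else hits[-1] + 1
            result = optimize.minimize_scalar(
                objective,
                bounds=(xline[first], xline[last]),
                method="Bounded",
                options={"maxiter": 100},
            )
            if result.success:
                return result.x
        except IndexError:
            # No grid point hit the target yet; retry with a denser grid.
            continue
    raise RuntimeError("No suitable alpha found after densifying the grid.")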
38 changes: 25 additions & 13 deletions geoml/utils.py
@@ -1,7 +1,7 @@
"""Miscellaneous ML functions that were pulled from `mosaic-modeling` and will be re-integrated into GeoML."""
from os.path import join
from pathlib import Path
from typing import Any, List, Tuple
from typing import Any, List, Tuple, Union

import matplotlib.pyplot as plt
import seaborn as sns
@@ -106,6 +106,7 @@ def make_1_to_1_plot(
plot_title: str,
response: str,
plot_save: bool = False,
filepath: Union[str, Path] = None,
model_dir: str = None,
model_name: str = None,
hue: List[Any] = None,
@@ -126,8 +127,10 @@ def make_1_to_1_plot(
plot_save (bool): If True, plot will be saved to `model_dir` as "{model_name}_1_to_1.png"
and nothing will be returned. Otherwise, the figure will be returned.
model_dir (str): If `plot_save`, file directory to save 1:1 plot.
model_name (str): If `plot_save`, model name to be used in 1:1 plot file name
filepath (Union[str, Path]): If `plot_save`, file path to save 1:1 plot.
model_dir (str): If `plot_save`, file directory to save 1:1 plot. Ignored if `filepath` is provided.
model_name (str): If `plot_save`, model name to be used in the filename. Ignored if `filepath` is provided.
hue (list): If provided, this list of same length as `y_pred` should be used to determine
color groupings for plotted points in the 1:1 plot.
@@ -178,12 +181,15 @@ def make_1_to_1_plot(
title=plot_title,
xlabel=f"Predicted {response}",
ylabel=f"Measured {response}",
xlim=xlim,
ylim=ylim,
xlim=linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
ylim=linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
)

ax.plot(
linspace(xlim[0], xlim[1], 2),
linspace(ylim[0], ylim[1], 2),
# linspace(xlim[0], xlim[1], 2),
# linspace(ylim[0], ylim[1], 2),
linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
color="k",
linestyle="--",
linewidth=1,
@@ -199,9 +205,14 @@ def make_1_to_1_plot(

# save if desired
if plot_save:
fname = join(model_dir, f"{model_name}_1_to_1.png")
Path(model_dir).mkdir(parents=True, exist_ok=True)
plt.savefig(fname, bbox_inches="tight")
filepath = (
Path(join(model_dir, f"{model_name}_1_to_1.png"))
if filepath is None
else filepath
)
filepath = Path(filepath) if not isinstance(filepath, Path) else filepath
filepath.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(filepath.with_suffix(".png"), bbox_inches="tight")
plt.close()
return None
else:
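Two behavior changes to `make_1_to_1_plot()` are visible in the hunks above: the axes and the dashed 1:1 reference line now span the union of the x- and y-limits, and saving now prefers an explicit `filepath` over the `model_dir`/`model_name` pair. A rough sketch of both pieces in isolation (the helper names below are hypothetical and introduced only for illustration):

from os.path import join
from pathlib import Path
from typing import Union

from numpy import linspace


def shared_axis_limits(xlim, ylim):
    # Both axes (and the 1:1 line) use the union of the original limits,
    # so the dashed reference line crosses the full plotting area.
    lo, hi = min(xlim[0], ylim[0]), max(xlim[1], ylim[1])
    return linspace(lo, hi, 2)  # two points suffice for a straight line


def resolve_plot_path(
    filepath: Union[str, Path] = None, model_dir: str = None, model_name: str = None
) -> Path:
    # Mirrors the saving logic: fall back to "<model_dir>/<model_name>_1_to_1.png"
    # only when no explicit filepath is given, create parent folders, and
    # force a ".png" suffix.
    if filepath is None:
        filepath = Path(join(model_dir, f"{model_name}_1_to_1.png"))
    filepath = Path(filepath)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    return filepath.with_suffix(".png")

Under these assumptions, `resolve_plot_path(filepath="plots/maize_yield")` would yield `plots/maize_yield.png`, while the existing `model_dir`/`model_name` route is unchanged.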
@@ -337,21 +348,22 @@ def train_test_split_custom_func(
return group_values, splits


def split_x_y_arrays(response: str, df_train_test: DataFrame) -> List:
def split_x_y_arrays(response: list[str], df_train_test: DataFrame) -> List:
"""Split `DataFrame` from `train_test_split_custom_func()` into the train/test X/y arrays based on `train_test` column.
These arrays can then be used with sklearn model objects for ML workflow.
"""
response = [response] if isinstance(response, str) else response
df_train = df_train_test.loc[df_train_test["train_test"] == "train"].drop(
columns=["train_test"]
)
df_test = df_train_test.loc[df_train_test["train_test"] == "test"].drop(
columns=["train_test"]
)

x_train = df_train.drop([response], axis=1).to_numpy()
x_train = df_train.drop(response, axis=1).to_numpy()
y_train = df_train[response].to_numpy()
x_test = df_test.drop([response], axis=1).to_numpy()
x_test = df_test.drop(response, axis=1).to_numpy()
y_test = df_test[response].to_numpy()

return x_train, y_train, x_test, y_test
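After this change, `response` accepts either a single column name or a list of names (a plain string is wrapped in a list internally, so existing callers keep working). A hypothetical usage sketch with a made-up frame (the column names are illustrative only):

import pandas as pd

from geoml.utils import split_x_y_arrays  # module path taken from this diff

# Minimal frame with the `train_test` column that split_x_y_arrays() expects.
df = pd.DataFrame(
    {
        "feat_a": [1.0, 2.0, 3.0, 4.0],
        "feat_b": [0.1, 0.2, 0.3, 0.4],
        "yield_kg_ha": [10.0, 11.0, 12.0, 13.0],
        "train_test": ["train", "train", "test", "test"],
    }
)

# A bare string still works; a list of response columns is now allowed too.
x_train, y_train, x_test, y_test = split_x_y_arrays("yield_kg_ha", df)
x_train, y_train, x_test, y_test = split_x_y_arrays(["yield_kg_ha"], df)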