Commit
Merge pull request #68 from SenteraLLC/DEM-416-refactor-cv
[DEM-416] Add `filepath` to `make_1_to_1_plot()`
tnigon committed Jan 23, 2024
2 parents 0474d1f + 79d7e91 commit 906bf01
Showing 6 changed files with 1,559 additions and 1,433 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -3,7 +3,7 @@ exclude: "(?x)^(\n geoml/feature_data.py|\n geoml/feature_selection.py|\n geoml/tests/|\n conftest.py\n)\n"
repos:
- repo: https://github.com/pycqa/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
@@ -12,7 +12,7 @@ repos:
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 6.1.0
rev: 7.0.0
hooks:
- id: flake8
exclude: (tests|doc)
2 changes: 1 addition & 1 deletion geoml/_version.py
@@ -1,3 +1,3 @@
"""Defines package version. Parsed by setup.py and imported by __init__.py."""

__version__ = "1.0.3"
__version__ = "1.0.4"
56 changes: 32 additions & 24 deletions geoml/feature_selection_new.py
@@ -308,31 +308,39 @@ def maximize_alpha(alpha, n_feats):
else:
return -alpha

# To be sure we get the global minimum, we need to narrow in a bit more.
# Here, we figure out where `n_feats` is achieved along `xline`. Then, we expand
# one index outside of that range in both directions and pass to optimization function.
xline = logspace(log(alpha_lower), log(alpha_upper), num=100, base=np_e)
yline = [maximize_alpha(a, n_feats) for a in xline]
x_inds = [i for i in range(len(yline)) if yline[i] != 0]
first_ind = x_inds[0] if x_inds[0] == 0 else x_inds[0] - 1
last_ind = x_inds[-1] if x_inds[-1] == 99 else x_inds[-1] + 1

new_alpha_lower = xline[first_ind]
new_alpha_upper = xline[last_ind]

result = optimize.minimize_scalar(
lambda a: maximize_alpha(a, n_feats),
bounds=(new_alpha_lower, new_alpha_upper),
method="Bounded",
options={"maxiter": 100},
)

if result["success"] is True:
return result["x"]
else:
raise Exception(
"Optimization step was unsuccessful. This is a rare error that could result from a low sample size when narrowing alpha range in step (3)."
# To be sure we get the global minimum, we need to narrow in a bit more. Here we figure out where `n_feats` is
# achieved along `xline`. Then, we expand one index outside of that range in both directions and pass to
# optimization function.
num_initial = 100
x_inds = []
for n_iter in range(1, 11):
n_iter += 1
xline = logspace(
log(alpha_lower), log(alpha_upper), num=(n_iter * num_initial), base=np_e
)
yline = [maximize_alpha(a, n_feats) for a in xline]
x_inds = [i for i in range(len(yline)) if yline[i] != 0]
try:
first_ind = x_inds[0] if x_inds[0] == 0 else x_inds[0] - 1
last_ind = x_inds[-1] if x_inds[-1] == 99 else x_inds[-1] + 1

new_alpha_lower = xline[first_ind]
new_alpha_upper = xline[last_ind]

result = optimize.minimize_scalar(
lambda a: maximize_alpha(a, n_feats),
bounds=(new_alpha_lower, new_alpha_upper),
method="Bounded",
options={"maxiter": 100},
)
if result["success"] is True:
return result["x"]
except IndexError:
continue

raise Exception(
"Optimization step was unsuccessful. This is a rare error that could result from a low sample size when narrowing alpha range in step (3)."
)


def lasso_feature_selection(
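The new block above swaps the single fixed 100-point alpha grid for a retry loop: the log-spaced grid is made denser on each pass until at least one grid point achieves `n_feats`, and an empty `x_inds` (surfacing as an `IndexError`) triggers the next, denser pass. A minimal, self-contained sketch of that densify-and-retry pattern, using a generic objective in place of the repository's `maximize_alpha()` closure (the helper name and loop bounds below are illustrative, not part of the commit):

import numpy as np
from scipy import optimize


def densify_and_retry(objective, lower, upper, num_initial=100, max_passes=10):
    """Densify a log-spaced grid until `objective` is nonzero somewhere,
    then run a bounded scalar minimization over the bracketing range."""
    for n_pass in range(1, max_passes + 1):
        xline = np.logspace(
            np.log(lower), np.log(upper), num=n_pass * num_initial, base=np.e
        )
        yline = [objective(x) for x in xline]
        hits = [i for i, y in enumerate(yline) if y != 0]
        try:
            # Expand one index beyond the nonzero range on both sides.
            first = hits[0] if hits[0] == 0 else hits[0] - 1
            last = hits[-1] if hits[-1] == len(xline) - 1 else hits[-1] + 1
            result = optimize.minimize_scalar(
                objective,
                bounds=(xline[first], xline[last]),
                method="Bounded",
                options={"maxiter": 100},
            )
            if result.success:
                return result.x
        except IndexError:
            # No grid point hit the target yet; retry with a denser grid.
            continue
    raise RuntimeError("No suitable alpha found after densifying the grid.")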
38 changes: 25 additions & 13 deletions geoml/utils.py
@@ -1,7 +1,7 @@
"""Miscellaneous ML functions that were pulled from `mosaic-modeling` and will be re-integrated into GeoML."""
from os.path import join
from pathlib import Path
from typing import Any, List, Tuple
from typing import Any, List, Tuple, Union

import matplotlib.pyplot as plt
import seaborn as sns
@@ -106,6 +106,7 @@ def make_1_to_1_plot(
plot_title: str,
response: str,
plot_save: bool = False,
filepath: Union[str, Path] = None,
model_dir: str = None,
model_name: str = None,
hue: List[Any] = None,
@@ -126,8 +127,10 @@ def make_1_to_1_plot(
plot_save (bool): If True, plot will be saved to `model_dir` as "{model_name}_1_to_1.png"
and nothing will be returned. Otherwise, the figure will be returned.
model_dir (str): If `plot_save`, file directory to save 1:1 plot.
model_name (str): If `plot_save`, model name to be used in 1:1 plot file name
filepath (Union[str, Path]): If `plot_save`, file path to save 1:1 plot.
model_dir (str): If `plot_save`, file directory to save 1:1 plot. Ignored if `filepath` is provided.
model_name (str): If `plot_save`, model name to be used in the filename. Ignored if `filepath` is provided.
hue (list): If provided, this list of same length as `y_pred` should be used to determine
color groupings for plotted points in the 1:1 plot.
@@ -178,12 +181,15 @@ def make_1_to_1_plot(
title=plot_title,
xlabel=f"Predicted {response}",
ylabel=f"Measured {response}",
xlim=xlim,
ylim=ylim,
xlim=linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
ylim=linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
)

ax.plot(
linspace(xlim[0], xlim[1], 2),
linspace(ylim[0], ylim[1], 2),
# linspace(xlim[0], xlim[1], 2),
# linspace(ylim[0], ylim[1], 2),
linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
color="k",
linestyle="--",
linewidth=1,
@@ -199,9 +205,14 @@ def make_1_to_1_plot(

# save if desired
if plot_save:
fname = join(model_dir, f"{model_name}_1_to_1.png")
Path(model_dir).mkdir(parents=True, exist_ok=True)
plt.savefig(fname, bbox_inches="tight")
filepath = (
Path(join(model_dir, f"{model_name}_1_to_1.png"))
if filepath is None
else filepath
)
filepath = Path(filepath) if not isinstance(filepath, Path) else filepath
filepath.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(filepath.with_suffix(".png"), bbox_inches="tight")
plt.close()
return None
else:
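Two behavior changes to `make_1_to_1_plot()` are visible in the hunks above: the axes and the dashed 1:1 reference line now span the union of the x- and y-limits, and saving now prefers an explicit `filepath` over the `model_dir`/`model_name` pair. A rough sketch of both pieces in isolation (the helper names below are hypothetical and introduced only for illustration):

from os.path import join
from pathlib import Path
from typing import Union

from numpy import linspace


def shared_axis_limits(xlim, ylim):
    # Both axes (and the 1:1 line) use the union of the original limits,
    # so the dashed reference line crosses the full plotting area.
    lo, hi = min(xlim[0], ylim[0]), max(xlim[1], ylim[1])
    return linspace(lo, hi, 2)  # two points suffice for a straight line


def resolve_plot_path(
    filepath: Union[str, Path] = None, model_dir: str = None, model_name: str = None
) -> Path:
    # Mirrors the saving logic: fall back to "<model_dir>/<model_name>_1_to_1.png"
    # only when no explicit filepath is given, create parent folders, and
    # force a ".png" suffix.
    if filepath is None:
        filepath = Path(join(model_dir, f"{model_name}_1_to_1.png"))
    filepath = Path(filepath)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    return filepath.with_suffix(".png")

Under these assumptions, `resolve_plot_path(filepath="plots/maize_yield")` would yield `plots/maize_yield.png`, while the existing `model_dir`/`model_name` route is unchanged.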
@@ -337,21 +348,22 @@ def train_test_split_custom_func(
return group_values, splits


def split_x_y_arrays(response: str, df_train_test: DataFrame) -> List:
def split_x_y_arrays(response: list[str], df_train_test: DataFrame) -> List:
"""Split `DataFrame` from `train_test_split_custom_func()` into the train/test X/y arrays based on `train_test` column.
These arrays can then be used with sklearn model objects for ML workflow.
"""
response = [response] if isinstance(response, str) else response
df_train = df_train_test.loc[df_train_test["train_test"] == "train"].drop(
columns=["train_test"]
)
df_test = df_train_test.loc[df_train_test["train_test"] == "test"].drop(
columns=["train_test"]
)

x_train = df_train.drop([response], axis=1).to_numpy()
x_train = df_train.drop(response, axis=1).to_numpy()
y_train = df_train[response].to_numpy()
x_test = df_test.drop([response], axis=1).to_numpy()
x_test = df_test.drop(response, axis=1).to_numpy()
y_test = df_test[response].to_numpy()

return x_train, y_train, x_test, y_test
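After this change, `response` accepts either a single column name or a list of names (a plain string is wrapped in a list internally, so existing callers keep working). A hypothetical usage sketch with a made-up frame (the column names are illustrative only):

import pandas as pd

from geoml.utils import split_x_y_arrays  # module path taken from this diff

# Minimal frame with the `train_test` column that split_x_y_arrays() expects.
df = pd.DataFrame(
    {
        "feat_a": [1.0, 2.0, 3.0, 4.0],
        "feat_b": [0.1, 0.2, 0.3, 0.4],
        "yield_kg_ha": [10.0, 11.0, 12.0, 13.0],
        "train_test": ["train", "train", "test", "test"],
    }
)

# A bare string still works; a list of response columns is now allowed too.
x_train, y_train, x_test, y_test = split_x_y_arrays("yield_kg_ha", df)
x_train, y_train, x_test, y_test = split_x_y_arrays(["yield_kg_ha"], df)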