Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DEM-416] Add filepath to make_1_to_1_plot() #68

Merged — 10 commits merged on Jan 23, 2024
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ exclude: "(?x)^(\n geoml/feature_data.py|\n geoml/feature_selection.py|\n
\ geoml/tests/|\n conftest.py\n)\n"
repos:
- repo: https://github.com/pycqa/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
name: isort (python)
Expand All @@ -12,7 +12,7 @@ repos:
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 6.1.0
rev: 7.0.0
hooks:
- id: flake8
exclude: (tests|doc)
Expand Down
2 changes: 1 addition & 1 deletion geoml/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Defines package version. Parsed by setup.py and imported by __init__.py."""

__version__ = "1.0.3"
__version__ = "1.0.4"
56 changes: 32 additions & 24 deletions geoml/feature_selection_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,31 +308,39 @@ def maximize_alpha(alpha, n_feats):
else:
return -alpha

# To be sure we get the global minimum, we need to narrow in a bit more.
# Here, we figure out where `n_feats` is achieved along `xline`. Then, we expand
# one index outside of that range in both directions and pass to optimization function.
xline = logspace(log(alpha_lower), log(alpha_upper), num=100, base=np_e)
yline = [maximize_alpha(a, n_feats) for a in xline]
x_inds = [i for i in range(len(yline)) if yline[i] != 0]
first_ind = x_inds[0] if x_inds[0] == 0 else x_inds[0] - 1
last_ind = x_inds[-1] if x_inds[-1] == 99 else x_inds[-1] + 1

new_alpha_lower = xline[first_ind]
new_alpha_upper = xline[last_ind]

result = optimize.minimize_scalar(
lambda a: maximize_alpha(a, n_feats),
bounds=(new_alpha_lower, new_alpha_upper),
method="Bounded",
options={"maxiter": 100},
)

if result["success"] is True:
return result["x"]
else:
raise Exception(
"Optimization step was unsuccessful. This is a rare error that could result from a low sample size when narrowing alpha range in step (3)."
# To be sure we get the global minimum, we need to narrow in a bit more. Here we figure out where `n_feats` is
# achieved along `xline`. Then, we expand one index outside of that range in both directions and pass to
# optimization function.
num_initial = 100
x_inds = []
for n_iter in range(1, 11):
n_iter += 1
xline = logspace(
log(alpha_lower), log(alpha_upper), num=(n_iter * num_initial), base=np_e
)
yline = [maximize_alpha(a, n_feats) for a in xline]
x_inds = [i for i in range(len(yline)) if yline[i] != 0]
try:
first_ind = x_inds[0] if x_inds[0] == 0 else x_inds[0] - 1
last_ind = x_inds[-1] if x_inds[-1] == 99 else x_inds[-1] + 1

new_alpha_lower = xline[first_ind]
new_alpha_upper = xline[last_ind]

result = optimize.minimize_scalar(
lambda a: maximize_alpha(a, n_feats),
bounds=(new_alpha_lower, new_alpha_upper),
method="Bounded",
options={"maxiter": 100},
)
if result["success"] is True:
return result["x"]
except IndexError:
continue

raise Exception(
"Optimization step was unsuccessful. This is a rare error that could result from a low sample size when narrowing alpha range in step (3)."
)


def lasso_feature_selection(
Expand Down
38 changes: 25 additions & 13 deletions geoml/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Miscellaneous ML functions that were pulled from `mosaic-modeling` and will be re-integrated into GeoML."""
from os.path import join
from pathlib import Path
from typing import Any, List, Tuple
from typing import Any, List, Tuple, Union

import matplotlib.pyplot as plt
import seaborn as sns
Expand Down Expand Up @@ -106,6 +106,7 @@ def make_1_to_1_plot(
plot_title: str,
response: str,
plot_save: bool = False,
filepath: Union[str, Path] = None,
model_dir: str = None,
model_name: str = None,
hue: List[Any] = None,
Expand All @@ -126,8 +127,10 @@ def make_1_to_1_plot(
plot_save (bool): If True, plot will be saved to `model_dir` as "{model_name}_1_to_1.png"
and nothing will be returned. Otherwise, the figure will be returned.

model_dir (str): If `plot_save`, file directory to save 1:1 plot.
model_name (str): If `plot_save`, model name to be used in 1:1 plot file name
filepath (Union[str, Path]): If `plot_save`, file path to save 1:1 plot.

model_dir (str): If `plot_save`, file directory to save 1:1 plot. Ignored if `filepath` is provided.
model_name (str): If `plot_save`, model name to be used in the filename. Ignored if `filepath` is provided.

hue (list): If provided, this list of same length as `y_pred` should be used to determine
color groupings for plotted points in the 1:1 plot.
Expand Down Expand Up @@ -178,12 +181,15 @@ def make_1_to_1_plot(
title=plot_title,
xlabel=f"Predicted {response}",
ylabel=f"Measured {response}",
xlim=xlim,
ylim=ylim,
xlim=linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
ylim=linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
)

ax.plot(
linspace(xlim[0], xlim[1], 2),
linspace(ylim[0], ylim[1], 2),
# linspace(xlim[0], xlim[1], 2),
# linspace(ylim[0], ylim[1], 2),
linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
linspace(min(xlim[0], ylim[0]), max(xlim[1], ylim[1]), 2),
color="k",
linestyle="--",
linewidth=1,
Expand All @@ -199,9 +205,14 @@ def make_1_to_1_plot(

# save if desired
if plot_save:
fname = join(model_dir, f"{model_name}_1_to_1.png")
Path(model_dir).mkdir(parents=True, exist_ok=True)
plt.savefig(fname, bbox_inches="tight")
filepath = (
Path(join(model_dir, f"{model_name}_1_to_1.png"))
if filepath is None
else filepath
)
filepath = Path(filepath) if not isinstance(filepath, Path) else filepath
filepath.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(filepath.with_suffix(".png"), bbox_inches="tight")
plt.close()
return None
else:
Expand Down Expand Up @@ -337,21 +348,22 @@ def train_test_split_custom_func(
return group_values, splits


def split_x_y_arrays(response: str, df_train_test: DataFrame) -> List:
def split_x_y_arrays(response: list[str], df_train_test: DataFrame) -> List:
"""Split `DataFrame` from `train_test_split_custom_func()` into the train/test X/y arrays based on `train_test` column.

These arrays can then be used with sklearn model objects for ML workflow.
"""
response = [response] if isinstance(response, str) else response
df_train = df_train_test.loc[df_train_test["train_test"] == "train"].drop(
columns=["train_test"]
)
df_test = df_train_test.loc[df_train_test["train_test"] == "test"].drop(
columns=["train_test"]
)

x_train = df_train.drop([response], axis=1).to_numpy()
x_train = df_train.drop(response, axis=1).to_numpy()
y_train = df_train[response].to_numpy()
x_test = df_test.drop([response], axis=1).to_numpy()
x_test = df_test.drop(response, axis=1).to_numpy()
y_test = df_test[response].to_numpy()

return x_train, y_train, x_test, y_test
Expand Down
Loading