generated from VectorInstitute/aieng-template-uv
-
Notifications
You must be signed in to change notification settings - Fork 1
Addressing Normalization TODO in clustering.py #81
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
8c82c2e
First commit addressing a few todos in the code. This includes a fair…
emersodb 17944bb
Small cleanup
emersodb 5f024b0
Addressing some coderabbit comments
emersodb 72d5a3f
Addressing some comments from Behnoosh
emersodb 1cebf9a
A few more PR comments
emersodb 5e0edac
Addressing normalization todo comment
emersodb ed705bb
Some small fixes
emersodb ba76f21
Merge branch 'main' into dbe/more_trainer_todos
emersodb b3e1d10
Merge branch 'dbe/more_trainer_todos' into dbe/clustering_todo
emersodb 7ac7cd8
Merge branch 'main' into dbe/clustering_todo
emersodb File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,8 +18,8 @@ | |
| from midst_toolkit.models.clavaddpm.enumerations import ( | ||
| ClusteringMethod, | ||
| Configs, | ||
| DataAndKeyNormalizationType, | ||
| GroupLengthsProbDicts, | ||
| KeyScalingType, | ||
| RelationOrder, | ||
| Tables, | ||
| ) | ||
|
|
@@ -163,6 +163,7 @@ def _pair_clustering( | |
| num_clusters: int, | ||
| parent_scale: float, | ||
| key_scale: float, | ||
| data_and_key_normalization: DataAndKeyNormalizationType = DataAndKeyNormalizationType.MINMAX, | ||
| clustering_method: ClusteringMethod = ClusteringMethod.KMEANS, | ||
| ) -> tuple[pd.DataFrame, pd.DataFrame, dict[int, dict[int, float]]]: | ||
| """ | ||
|
|
@@ -175,11 +176,12 @@ def _pair_clustering( | |
| parent_name: Name of the parent table. | ||
| child_name: Name of the child table. | ||
| num_clusters: Number of clusters. | ||
| parent_scale: Scaling factor applied to the parent table, provided by the config. | ||
| It will be applied to the features to weight their importance during clustering. | ||
| key_scale: Scaling factor applied to the foreign key values that link | ||
| the child table to the parent table. This will weight how much influence | ||
| the parent-child relationship has in the clustering algorithm. | ||
| parent_scale: Scaling factor applied to the parent table, provided by the config. It will be applied to the | ||
| features to weight their importance during clustering. | ||
| key_scale: Scaling factor applied to the foreign key values that link the child table to the parent table. | ||
| This will weight how much influence the parent-child relationship has in the clustering algorithm. | ||
| data_and_key_normalization: Type of normalization for the child and parent data and keys. Default is | ||
| ``DataAndKeyNormalizationType.MINMAX``. | ||
| clustering_method: Method of clustering. Default is ClusteringMethod.KMEANS. | ||
|
|
||
| Returns: | ||
|
|
@@ -226,6 +228,7 @@ def _pair_clustering( | |
| parent_primary_key, | ||
| parent_scale, | ||
| key_scale, | ||
| data_and_key_normalization, | ||
| ) | ||
|
|
||
| cluster_labels = _get_cluster_labels(cluster_data, clustering_method, num_clusters) | ||
|
|
@@ -333,35 +336,42 @@ def _merge_parent_data_with_child_data( | |
| return merged_parent_data | ||
|
|
||
|
|
||
| def _get_min_max_and_quantile_for_numerical_columns( | ||
| def get_normalized_numerical_columns( | ||
| child_numerical_data: np.ndarray, | ||
| parent_numerical_data: np.ndarray, | ||
| parent_scale: float, | ||
| ) -> tuple[np.ndarray, np.ndarray]: | ||
| normalization_method: DataAndKeyNormalizationType = DataAndKeyNormalizationType.MINMAX, | ||
| ) -> np.ndarray: | ||
| """ | ||
| Get the min-max and quantile values for the numerical columns in both the | ||
| child and parent data. | ||
| The child and parent table numerical data are merged and then normalized together according to the normalization | ||
| scheme specified by ``normalization_method``. After normalization, data in the parent numerical data is scaled | ||
| by the ``parent_scale`` float. | ||
|
|
||
| Args: | ||
| child_numerical_data: Numpy array of the child numerical data. | ||
| parent_numerical_data: Numpy array of the parent numerical data. | ||
| parent_scale: Scaling factor applied to the parent data. | ||
| child_numerical_data: Numpy array of the child table numerical data. | ||
| parent_numerical_data: Numpy array of the parent table numerical data. | ||
| parent_scale: Scaling factor applied to the parent data AFTER normalization. | ||
| normalization_method: The approach to be used to normalize the combined data. Defaults to | ||
| DataAndKeyNormalizationType.MINMAX. | ||
|
|
||
| Returns: | ||
| A tuple with two numpy arrays, one with the min-max values and one with the quantile | ||
| values for the numerical columns. | ||
| A numpy array containing the merged child and parent table (in that order) numerical data, normalized using | ||
| the specified strategy, with the parent data scaled by the provided ``parent_scale``. | ||
| """ | ||
| joint_matrix = np.concatenate([child_numerical_data, parent_numerical_data], axis=1) | ||
| matrix_p_index = child_numerical_data.shape[1] | ||
| parent_start_index = child_numerical_data.shape[1] | ||
|
|
||
| # Perform quantile normalization using QuantileTransformer | ||
| numerical_quantile = _quantile_normalize_sklearn(joint_matrix) | ||
| numerical_min_max = _min_max_normalize_sklearn(joint_matrix) | ||
| if normalization_method == DataAndKeyNormalizationType.MINMAX: | ||
| normalized_data = _min_max_normalize_sklearn(joint_matrix) | ||
| elif normalization_method == DataAndKeyNormalizationType.QUANTILE: | ||
| normalized_data = _quantile_normalize_sklearn(joint_matrix) | ||
| else: | ||
| raise ValueError(f"Unrecognized Normalization Method: {normalization_method}") | ||
|
|
||
| numerical_quantile[:, matrix_p_index:] = parent_scale * numerical_quantile[:, matrix_p_index:] | ||
| numerical_min_max[:, matrix_p_index:] = parent_scale * numerical_min_max[:, matrix_p_index:] | ||
| # Scale the parent data using the parent scale value | ||
| normalized_data[:, parent_start_index:] = parent_scale * normalized_data[:, parent_start_index:] | ||
|
|
||
| return numerical_min_max, numerical_quantile | ||
| return normalized_data | ||
|
|
||
|
|
||
| def _one_hot_encode_categorical_columns( | ||
|
|
@@ -422,28 +432,28 @@ def _prepare_cluster_data( | |
| parent_primary_key: str, | ||
| parent_scale: float, | ||
| key_scale: float, | ||
| key_scaling_type: KeyScalingType = KeyScalingType.MINMAX, | ||
| data_and_key_normalization: DataAndKeyNormalizationType = DataAndKeyNormalizationType.MINMAX, | ||
| ) -> np.ndarray: | ||
| """ | ||
| Prepare the data for the clustering algorithm, which comprises of merging the parent | ||
| and child data, splitting the data into categorical and numerical columns, and | ||
| normalizing the data. | ||
| Prepare the data for the clustering algorithm, which consists of merging the parent and child data, splitting | ||
| the data into categorical and numerical columns, and normalizing the data. | ||
|
|
||
| Args: | ||
| child_data: Numpy array of the child data. | ||
| parent_data: Numpy array of the parent data. | ||
| child_domain: Dictionary of the domain of the child table. The domain dictionary | ||
| holds metadata about the columns of each one of the tables. | ||
| parent_domain: Dictionary of the domain of the parent table. The domain dictionary | ||
| holds metadata about the columns of each one of the tables. | ||
| child_domain: Dictionary of the domain of the child table. The domain dictionary holds metadata about the | ||
| columns of each one of the tables. | ||
| parent_domain: Dictionary of the domain of the parent table. The domain dictionary holds metadata about the | ||
| columns of each one of the tables. | ||
| all_child_columns: List of all child columns. | ||
| all_parent_columns: List of all parent columns. | ||
| parent_primary_key: Name of the parent primary key. | ||
| parent_scale: Scaling factor applied to the parent table, provided by the config. | ||
| It will be applied to the features to weight their importance during clustering. | ||
| key_scale: Scaling factor applied to the tables' keys. This will weight how much influence | ||
| the parent-child relationship has in the clustering algorithm. | ||
| key_scaling_type: Type of scaling for the tables' keys. Default is KeyScalingType.MINMAX. | ||
| parent_scale: Scaling factor applied to the parent table, provided by the config. It will be applied to the | ||
| features to weight their importance during clustering. | ||
| key_scale: Scaling factor applied to the tables' keys. This will weight how much influence the parent-child | ||
| relationship has in the clustering algorithm. | ||
| data_and_key_normalization: Type of normalization for the child and parent data and keys. Default is | ||
| ``DataAndKeyNormalizationType.MINMAX``. | ||
|
|
||
| Returns: | ||
| Numpy array of the data prepared for the clustering algorithm. | ||
|
|
@@ -475,21 +485,21 @@ def _prepare_cluster_data( | |
| parent_numerical_data = merged_data[:, parent_numerical_columns] | ||
| parent_categorical_data = merged_data[:, parent_categorical_columns] | ||
|
|
||
| numerical_min_max, numerical_quantile = _get_min_max_and_quantile_for_numerical_columns( | ||
| numerical_normalized = get_normalized_numerical_columns( | ||
emersodb marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| child_numerical_data, | ||
| parent_numerical_data, | ||
| parent_scale, | ||
| data_and_key_normalization, | ||
| ) | ||
|
|
||
| reshaped_parent_data = merged_data[:, parent_primary_key_index].reshape(-1, 1) | ||
| if key_scaling_type == KeyScalingType.MINMAX: | ||
| key_normalized = _min_max_normalize_sklearn(reshaped_parent_data) | ||
| numerical_normalized = numerical_min_max | ||
| elif key_scaling_type == KeyScalingType.QUANTILE: | ||
| key_normalized = _quantile_normalize_sklearn(reshaped_parent_data) | ||
| numerical_normalized = numerical_quantile | ||
| # Normalizing the parent table primary key data. | ||
| reshaped_parent_primary_key_data = merged_data[:, parent_primary_key_index].reshape(-1, 1) | ||
| if data_and_key_normalization == DataAndKeyNormalizationType.MINMAX: | ||
| key_normalized = _min_max_normalize_sklearn(reshaped_parent_primary_key_data) | ||
| elif data_and_key_normalization == DataAndKeyNormalizationType.QUANTILE: | ||
| key_normalized = _quantile_normalize_sklearn(reshaped_parent_primary_key_data) | ||
| else: | ||
| raise ValueError(f"Unsupported foreign key scaling type: {key_scaling_type}") | ||
| raise ValueError(f"Unsupported data and key normalization type: {data_and_key_normalization}") | ||
|
|
||
| key_scaled = key_scale * key_normalized | ||
|
|
||
|
|
@@ -727,9 +737,6 @@ def _parse_numpy_number_as_int(number: np.number) -> int: | |
| raise ValueError(f"Number is not a number: {item}") | ||
|
|
||
|
|
||
| # TODO: Refactor the functions below to be a single one with a "method" parameter. | ||
|
|
||
|
|
||
| def _quantile_normalize_sklearn(matrix: np.ndarray) -> np.ndarray: | ||
| """ | ||
| Quantile normalize the input matrix using Sklearn's QuantileTransformer. | ||
|
|
@@ -745,15 +752,7 @@ def _quantile_normalize_sklearn(matrix: np.ndarray) -> np.ndarray: | |
| random_state=42, # TODO: do we really need to hardcode the random state? | ||
| ) # Change output_distribution as needed | ||
|
|
||
| normalized_data = np.empty((matrix.shape[0], 0)) | ||
|
|
||
| # Apply QuantileTransformer to each column and concatenate the results | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer comment: Literally no idea why this was being done per column... |
||
| for col in range(matrix.shape[1]): | ||
| column = matrix[:, col].reshape(-1, 1) | ||
| transformed_column = transformer.fit_transform(column) | ||
| normalized_data = np.concatenate((normalized_data, transformed_column), axis=1) | ||
|
|
||
| return normalized_data | ||
| return transformer.fit_transform(matrix) | ||
|
|
||
|
|
||
| def _min_max_normalize_sklearn(matrix: np.ndarray) -> np.ndarray: | ||
|
|
@@ -767,16 +766,7 @@ def _min_max_normalize_sklearn(matrix: np.ndarray) -> np.ndarray: | |
| Numpy array of the normalized data. | ||
| """ | ||
| scaler = MinMaxScaler(feature_range=(-1, 1)) | ||
|
|
||
| normalized_data = np.empty((matrix.shape[0], 0)) | ||
|
|
||
| # Apply MinMaxScaler to each column and concatenate the results | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer comment: Literally no idea why this was being done per column... |
||
| for col in range(matrix.shape[1]): | ||
| column = matrix[:, col].reshape(-1, 1) | ||
| transformed_column = scaler.fit_transform(column) | ||
| normalized_data = np.concatenate((normalized_data, transformed_column), axis=1) | ||
|
|
||
| return normalized_data | ||
| return scaler.fit_transform(matrix) | ||
|
|
||
|
|
||
| def _aggregate_and_sample( | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,83 @@ | ||
| import numpy as np | ||
|
|
||
| from midst_toolkit.common.random import set_all_random_seeds, unset_all_random_seeds | ||
| from midst_toolkit.models.clavaddpm.clustering import ( | ||
| _min_max_normalize_sklearn, | ||
| _quantile_normalize_sklearn, | ||
| get_normalized_numerical_columns, | ||
| ) | ||
| from midst_toolkit.models.clavaddpm.enumerations import DataAndKeyNormalizationType | ||
|
|
||
|
|
||
| def test_quantile_normalize_sklearn() -> None: | ||
| set_all_random_seeds(42) | ||
| data_to_normalize = np.random.randint(0, 3, (5, 5)) | ||
| normalized_data = _quantile_normalize_sklearn(data_to_normalize) | ||
| assert np.allclose( | ||
| normalized_data, | ||
| np.array( | ||
| [ | ||
| [5.19933758, -5.19933758, 5.19933758, 5.19933758, -5.19933758], | ||
| [-5.19933758, 5.19933758, 0.0, 5.19933758, 5.19933758], | ||
| [5.19933758, 5.19933758, -5.19933758, 5.19933758, 0.31863936], | ||
| [-5.19933758, 0.0, 0.0, -5.19933758, 0.31863936], | ||
| [-5.19933758, -5.19933758, 0.0, -5.19933758, -5.19933758], | ||
| ] | ||
| ), | ||
| atol=1e-5, | ||
| ) | ||
| unset_all_random_seeds() | ||
|
|
||
|
|
||
| def test_min_max_normalize_sklearn() -> None: | ||
| set_all_random_seeds(42) | ||
| data_to_normalize = np.random.randint(0, 3, (5, 5)) | ||
| normalized_data = _min_max_normalize_sklearn(data_to_normalize) | ||
| assert np.allclose( | ||
| normalized_data, | ||
| np.array( | ||
| [ | ||
| [1.0, -1.0, 1.0, 1.0, -1.0], | ||
| [-1.0, 1.0, 0.0, 1.0, 1.0], | ||
| [1.0, 1.0, -1.0, 1.0, 0.0], | ||
| [-1.0, 0.0, 0.0, -1.0, 0.0], | ||
| [-1.0, -1.0, 0.0, -1.0, -1.0], | ||
| ] | ||
| ), | ||
| atol=1e-8, | ||
| ) | ||
| unset_all_random_seeds() | ||
|
|
||
|
|
||
| def test_get_normalized_numerical_columns() -> None: | ||
| set_all_random_seeds(42) | ||
| child_data = np.random.randint(0, 3, (3, 3)) | ||
| parent_data = np.random.randint(0, 3, (3, 3)) | ||
| scale = 2.0 | ||
| normalization_type = DataAndKeyNormalizationType.MINMAX | ||
|
|
||
| normalized_data = get_normalized_numerical_columns(child_data, parent_data, scale, normalization_type) | ||
| assert np.allclose( | ||
| normalized_data, | ||
| np.array( | ||
| [[-1.0, -1.0, 1.0, 2.0, 2.0, 2.0], [-1.0, -1.0, -1.0, -2.0, 2.0, -2.0], [-1.0, 1.0, 1.0, -2.0, -2.0, -2.0]] | ||
| ), | ||
| atol=1e-6, | ||
| ) | ||
|
|
||
| normalization_type = DataAndKeyNormalizationType.QUANTILE | ||
| normalized_data = get_normalized_numerical_columns(child_data, parent_data, scale, normalization_type) | ||
|
|
||
| assert np.allclose( | ||
| normalized_data, | ||
| np.array( | ||
| [ | ||
| [-5.19933758, -5.19933758, 5.19933758, 2 * 5.19933758, 2 * 5.19933758, 2 * 5.19933758], | ||
| [-5.19933758, -5.19933758, -5.19933758, 2 * -5.19933758, 2 * 5.19933758, 2 * -5.19933758], | ||
| [-5.19933758, 5.19933758, 5.19933758, 2 * -5.19933758, 2 * -5.19933758, 2 * -5.19933758], | ||
| ] | ||
| ), | ||
| atol=1e-5, | ||
| ) | ||
|
|
||
| unset_all_random_seeds() |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.