Skip to content

Commit e1f453f

Browse files
fix(KDP): adding FeatureSelection to Text and Date features (#28)
2 parents 5c3a974 + 47a6267 commit e1f453f

File tree

2 files changed

+34
-5
lines changed

2 files changed

+34
-5
lines changed

docs/imgs/complex_example.png

-49.9 KB
Loading

kdp/processor.py

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ class FeatureSelectionPlacementOptions(str, Enum):
6565
NONE = "none"
6666
NUMERIC = "numeric"
6767
CATEGORICAL = "categorical"
68+
TEXT = "text"
69+
DATE = "date"
6870
ALL_FEATURES = "all_features"
6971

7072

@@ -645,11 +647,6 @@ def _add_pipeline_numeric(
645647
# Check if distribution-aware encoding is enabled
646648
if self.use_distribution_aware:
647649
logger.info(f"Using distribution-aware encoding for {feature_name}")
648-
# Cast to float32 before distribution-aware encoding
649-
preprocessor.add_processing_step(
650-
layer_creator=PreprocessorLayerFactory.cast_to_float32_layer,
651-
name=f"pre_dist_cast_to_float_{feature_name}",
652-
)
653650
# Check if manually specified distribution is provided
654651
_prefered_distribution = _feature.kwargs.get("prefered_distribution")
655652
if _prefered_distribution is not None:
@@ -920,6 +917,22 @@ def _add_pipeline_text(self, feature_name: str, input_layer, stats: dict) -> Non
920917
)
921918
# Process the feature
922919
_output_pipeline = preprocessor.chain(input_layer=input_layer)
920+
921+
# Apply feature selection if enabled for text features
922+
if (
923+
self.feature_selection_placement == FeatureSelectionPlacementOptions.TEXT
924+
or self.feature_selection_placement
925+
== FeatureSelectionPlacementOptions.ALL_FEATURES
926+
):
927+
feature_selector = PreprocessorLayerFactory.variable_selection_layer(
928+
name=f"{feature_name}_feature_selection",
929+
nr_features=1, # Single feature for now
930+
units=self.feature_selection_units,
931+
dropout_rate=self.feature_selection_dropout,
932+
)
933+
_output_pipeline, feature_weights = feature_selector([_output_pipeline])
934+
self.processed_features[f"{feature_name}_weights"] = feature_weights
935+
923936
self.processed_features[feature_name] = _output_pipeline
924937

925938
@_monitor_performance
@@ -981,6 +994,22 @@ def _add_pipeline_date(self, feature_name: str, input_layer) -> None:
981994

982995
# Process the feature
983996
_output_pipeline = preprocessor.chain(input_layer=input_layer)
997+
998+
# Apply feature selection if enabled for date features
999+
if (
1000+
self.feature_selection_placement == FeatureSelectionPlacementOptions.DATE
1001+
or self.feature_selection_placement
1002+
== FeatureSelectionPlacementOptions.ALL_FEATURES
1003+
):
1004+
feature_selector = PreprocessorLayerFactory.variable_selection_layer(
1005+
name=f"{feature_name}_feature_selection",
1006+
nr_features=1, # Single feature for now
1007+
units=self.feature_selection_units,
1008+
dropout_rate=self.feature_selection_dropout,
1009+
)
1010+
_output_pipeline, feature_weights = feature_selector([_output_pipeline])
1011+
self.processed_features[f"{feature_name}_weights"] = feature_weights
1012+
9841013
self.processed_features[feature_name] = _output_pipeline
9851014

9861015
@_monitor_performance

0 commit comments

Comments
 (0)