Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion mambular/arch_utils/layer_utils/embedding_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,10 @@ def forward(self, num_features, cat_features, emb_features):
# Process categorical embeddings
if self.cat_embeddings and cat_features is not None:
cat_embeddings = [
emb(cat_features[i]) for i, emb in enumerate(self.cat_embeddings)
emb(cat_features[i]) if emb(cat_features[i]).ndim == 3 else emb(cat_features[i]).unsqueeze(1)
for i, emb in enumerate(self.cat_embeddings)
]

cat_embeddings = torch.stack(cat_embeddings, dim=1)
cat_embeddings = torch.squeeze(cat_embeddings, dim=2)
if self.layer_norm_after_embedding:
Expand Down Expand Up @@ -189,6 +191,7 @@ def forward(self, num_features, cat_features, emb_features):
]
emb_embeddings = torch.stack(emb_embeddings, dim=1)
else:

emb_embeddings = torch.stack(emb_features, dim=1)
if self.layer_norm_after_embedding:
emb_embeddings = self.embedding_norm(emb_embeddings)
Expand All @@ -199,6 +202,7 @@ def forward(self, num_features, cat_features, emb_features):

if embeddings:
x = torch.cat(embeddings, dim=1) if len(embeddings) > 1 else embeddings[0]

else:
raise ValueError("No features provided to the model.")

Expand Down
59 changes: 40 additions & 19 deletions mambular/preprocessing/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ class Preprocessor:

Parameters
----------
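feature_preprocessing : dict or None, default=None
Dictionary mapping column names to preprocessing techniques. Columns that are not listed fall back to `numerical_preprocessing` (numerical features) or `categorical_preprocessing` (categorical features). Example: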
{
"num_feature1": "minmax",
"num_feature2": "ple",
"cat_feature1": "one-hot",
"cat_feature2": "int"
}
n_bins : int, default=64
The number of bins to use for numerical feature binning. This parameter is relevant
only if `numerical_preprocessing` is set to 'binning', 'ple' or 'one-hot'.
Expand Down Expand Up @@ -94,6 +102,7 @@ class Preprocessor:

def __init__(
self,
feature_preprocessing=None,
n_bins=64,
numerical_preprocessing="ple",
categorical_preprocessing="int",
Expand Down Expand Up @@ -153,6 +162,7 @@ def __init__(
)

self.use_decision_tree_bins = use_decision_tree_bins
self.feature_preprocessing = feature_preprocessing or {}
self.column_transformer = None
self.fitted = False
self.binning_strategy = binning_strategy
Expand Down Expand Up @@ -300,6 +310,10 @@ def fit(self, X, y=None, embeddings=None):

if numerical_features:
for feature in numerical_features:
feature_preprocessing = self.feature_preprocessing.get(
feature, self.numerical_preprocessing
)

# Extend the annotation list whenever a new transformer is added, either from sklearn or custom
numeric_transformer_steps: list[
tuple[
Expand All @@ -322,7 +336,7 @@ def fit(self, X, y=None, embeddings=None):
| SigmoidExpansion,
]
] = [("imputer", SimpleImputer(strategy="mean"))]
if self.numerical_preprocessing in ["binning", "one-hot"]:
if feature_preprocessing in ["binning", "one-hot"]:
bins = (
self._get_decision_tree_bins(X[[feature]], y, [feature])
if self.use_decision_tree_bins
Expand Down Expand Up @@ -356,22 +370,22 @@ def fit(self, X, y=None, embeddings=None):
]
)

if self.numerical_preprocessing == "one-hot":
if feature_preprocessing == "one-hot":
numeric_transformer_steps.extend(
[
("onehot_from_ordinal", OneHotFromOrdinal()),
]
)

elif self.numerical_preprocessing == "standardization":
elif feature_preprocessing == "standardization":
numeric_transformer_steps.append(("scaler", StandardScaler()))

elif self.numerical_preprocessing == "minmax":
elif feature_preprocessing == "minmax":
numeric_transformer_steps.append(
("minmax", MinMaxScaler(feature_range=(-1, 1)))
)

elif self.numerical_preprocessing == "quantile":
elif feature_preprocessing == "quantile":
numeric_transformer_steps.append(
(
"quantile",
Expand All @@ -381,7 +395,7 @@ def fit(self, X, y=None, embeddings=None):
)
)

elif self.numerical_preprocessing == "polynomial":
elif feature_preprocessing == "polynomial":
if self.scaling_strategy == "standardization":
numeric_transformer_steps.append(("scaler", StandardScaler()))
elif self.scaling_strategy == "minmax":
Expand All @@ -395,10 +409,10 @@ def fit(self, X, y=None, embeddings=None):
)
)

elif self.numerical_preprocessing == "robust":
elif feature_preprocessing == "robust":
numeric_transformer_steps.append(("robust", RobustScaler()))

elif self.numerical_preprocessing == "splines":
elif feature_preprocessing == "splines":
if self.scaling_strategy == "standardization":
numeric_transformer_steps.append(("scaler", StandardScaler()))
elif self.scaling_strategy == "minmax":
Expand All @@ -419,7 +433,7 @@ def fit(self, X, y=None, embeddings=None):
),
)

elif self.numerical_preprocessing == "rbf":
elif feature_preprocessing == "rbf":
if self.scaling_strategy == "standardization":
numeric_transformer_steps.append(("scaler", StandardScaler()))
elif self.scaling_strategy == "minmax":
Expand All @@ -438,7 +452,7 @@ def fit(self, X, y=None, embeddings=None):
)
)

elif self.numerical_preprocessing == "sigmoid":
elif feature_preprocessing == "sigmoid":
if self.scaling_strategy == "standardization":
numeric_transformer_steps.append(("scaler", StandardScaler()))
elif self.scaling_strategy == "minmax":
Expand All @@ -457,15 +471,19 @@ def fit(self, X, y=None, embeddings=None):
)
)

elif self.numerical_preprocessing == "ple":

elif feature_preprocessing == "ple":
numeric_transformer_steps.append(
("minmax", MinMaxScaler(feature_range=(-1, 1)))
)
numeric_transformer_steps.append(
("ple", PLE(n_bins=self.n_bins, task=self.task))
)

elif self.numerical_preprocessing == "box-cox":
elif feature_preprocessing == "box-cox":
numeric_transformer_steps.append(
("minmax", MinMaxScaler(feature_range=(1e-03, 1)))
)
numeric_transformer_steps.append(
("check_positive", MinMaxScaler(feature_range=(1e-3, 1)))
)
Expand All @@ -476,15 +494,15 @@ def fit(self, X, y=None, embeddings=None):
)
)

elif self.numerical_preprocessing == "yeo-johnson":
elif feature_preprocessing == "yeo-johnson":
numeric_transformer_steps.append(
(
"yeo-johnson",
PowerTransformer(method="yeo-johnson", standardize=True),
)
)

elif self.numerical_preprocessing == "none":
elif feature_preprocessing == "none":
numeric_transformer_steps.append(
(
"none",
Expand All @@ -498,15 +516,18 @@ def fit(self, X, y=None, embeddings=None):

if categorical_features:
for feature in categorical_features:
if self.categorical_preprocessing == "int":
feature_preprocessing = self.feature_preprocessing.get(
feature, self.categorical_preprocessing
)
if feature_preprocessing == "int":
# Use ContinuousOrdinalEncoder for "int"
categorical_transformer = Pipeline(
[
("imputer", SimpleImputer(strategy="most_frequent")),
("continuous_ordinal", ContinuousOrdinalEncoder()),
]
)
elif self.categorical_preprocessing == "one-hot":
elif feature_preprocessing == "one-hot":
# Use OneHotEncoder for "one-hot"
categorical_transformer = Pipeline(
[
Expand All @@ -516,15 +537,15 @@ def fit(self, X, y=None, embeddings=None):
]
)

elif self.categorical_preprocessing == "none":
elif feature_preprocessing == "none":
# Use NoTransformer for "none"
categorical_transformer = Pipeline(
[
("imputer", SimpleImputer(strategy="most_frequent")),
("none", NoTransformer()),
]
)
elif self.categorical_preprocessing == "pretrained":
elif feature_preprocessing == "pretrained":
categorical_transformer = Pipeline(
[
("imputer", SimpleImputer(strategy="most_frequent")),
Expand All @@ -533,7 +554,7 @@ def fit(self, X, y=None, embeddings=None):
)
else:
raise ValueError(
f"Unknown categorical_preprocessing type: {self.categorical_preprocessing}"
f"Unknown categorical_preprocessing type: {feature_preprocessing}"
)

# Append the transformer for the current categorical feature
Expand Down
Loading