In [30]:
import pandas as pd
import numpy as np
import json

In [38]:
# Load the run configuration in this cell so the dataset path below resolves
# on a fresh kernel (Restart & Run All) instead of relying on `config` left
# over from a cell that appears later in the notebook.
with open("myjson.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# The CSV location is taken from the config rather than hard-coded here.
df = pd.read_csv(config["design_state_data"]["session_info"]["dataset"])

In [43]:
# Seed the sample so the preview shows the same rows on every run.
df.sample(10, random_state=42)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
108,6.7,2.5,5.8,1.8,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
19,5.1,3.8,1.5,0.3,Iris-setosa
37,4.9,3.1,1.5,0.1,Iris-setosa
26,5.0,3.4,1.6,0.4,Iris-setosa
78,6.0,2.9,4.5,1.5,Iris-versicolor
41,4.5,2.3,1.3,0.3,Iris-setosa
135,7.7,3.0,6.1,2.3,Iris-virginica
9,4.9,3.1,1.5,0.1,Iris-setosa
38,4.4,3.0,1.3,0.2,Iris-setosa


In [39]:
# Read the run configuration. The encoding is set explicitly so the file is
# decoded the same way on every platform, not with the locale default.
with open("myjson.json", "r", encoding="utf-8") as f:
    config = json.load(f)


In [40]:
# Pull the target column name and the prediction type out of the config's
# "target" section in one pass.
target_column, prediction_type = (
    config["design_state_data"]["target"][key]
    for key in ("target", "prediction_type")  # prediction_type: "Regression" or "Classification"
)
# One value per line.
print(target_column, prediction_type, sep="\n")

petal_width
Regression


In [44]:
def apply_missing_value_imputation(df, feature_handling_json):
    """Fill missing values column by column as described by the feature config.

    Only features with ``is_selected`` truthy and
    ``feature_details["missing_values"] == "Impute"`` are processed.
    ``impute_with`` selects the strategy:

    * ``"Average of values"`` - fill with the column mean.
    * ``"custom"`` - fill with ``feature_details["impute_value"]``.

    Any other ``impute_with`` value leaves the column untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; the work is done on a copy.
    feature_handling_json : dict
        Mapping of column name to a settings dict holding ``is_selected`` and
        ``feature_details``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the configured imputations applied.

    Raises
    ------
    ValueError
        If a feature asks for ``"custom"`` imputation without an
        ``impute_value``.
    """
    # Work on a copy so re-running the calling cell never sees a frame that a
    # previous call already altered.
    imputed = df.copy()

    for feature_name, feature_info in feature_handling_json.items():
        if not feature_info.get("is_selected"):
            continue

        details = feature_info.get("feature_details") or {}
        if details.get("missing_values") != "Impute":
            continue

        # The config may list features that are absent from this frame;
        # report and move on rather than fail with a bare KeyError.
        if feature_name not in imputed.columns:
            print(f"[WARN] Skipping imputation for {feature_name}: column not found")
            continue

        impute_with = details.get("impute_with")

        if impute_with == "Average of values":
            mean_val = imputed[feature_name].mean()
            imputed[feature_name] = imputed[feature_name].fillna(mean_val)
            print(f"[INFO] Imputed {feature_name} with mean value {mean_val}")
        elif impute_with == "custom":
            impute_value = details.get("impute_value")
            if impute_value is None:
                # fillna(None) is rejected by pandas with a generic message;
                # name the offending feature instead.
                raise ValueError(
                    f"Feature {feature_name} requests custom imputation but has no impute_value"
                )
            imputed[feature_name] = imputed[feature_name].fillna(impute_value)
            print(f"[INFO] Imputed {feature_name} with custom value {impute_value}")

    return imputed

In [48]:
# Per-feature preprocessing settings from the config; each entry is keyed by
# the feature (column) name.
feature_handling_json = config["design_state_data"]["feature_handling"]
# Pretty-print the settings with 4-space indentation for inspection.
print(json.dumps(feature_handling_json, indent=4))

{
    "sepal_length": {
        "feature_name": "sepal_length",
        "is_selected": true,
        "feature_variable_type": "numerical",
        "feature_details": {
            "numerical_handling": "Keep as regular numerical feature",
            "rescaling": "No rescaling",
            "make_derived_feats": false,
            "missing_values": "Impute",
            "impute_with": "Average of values",
            "impute_value": 0
        }
    },
    "sepal_width": {
        "feature_name": "sepal_width",
        "is_selected": true,
        "feature_variable_type": "numerical",
        "feature_details": {
            "numerical_handling": "Keep as regular numerical feature",
            "rescaling": "No rescaling",
            "make_derived_feats": false,
            "missing_values": "Impute",
            "impute_with": "custom",
            "impute_value": -1
        }
    },
    "petal_length": {
        "feature_name": "petal_length",
        "is_selected": true,
        "fe

In [46]:
# Run the configured per-feature imputation and rebind df to the returned frame.
df = apply_missing_value_imputation(df, feature_handling_json)

[INFO] Imputed sepal_length with mean value 5.843333333333334
[INFO] Imputed sepal_width with custom value -1
[INFO] Imputed petal_length with mean value 3.758666666666666
[INFO] Imputed petal_width with custom value -2


In [47]:
# Show the first rows of df as it stands after the imputation cell above.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [49]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin

In [51]:
class FeatureReducer(BaseEstimator, TransformerMixin):
    """Config-driven feature reduction step with the scikit-learn fit/transform API.

    Supported ``method`` values:

    * ``"No Reduction"`` - return the input unchanged.
    * ``"PCA"`` - project onto ``num_features`` principal components.
    * ``"Corr with Target"`` - keep the ``num_features`` columns with the
      largest absolute correlation with ``y``.
    * ``"Tree-based"`` - keep the ``num_features`` columns ranked highest by
      the feature importances of a ``RandomForestRegressor``.

    Parameters
    ----------
    method : str
        One of the method names above.
    num_features : int-like or None
        Number of components / columns to keep. Falsy values (None, 0) are
        stored as None; for the two selection methods None keeps every column.
    num_trees : int-like or None
        Number of trees for ``"Tree-based"``. None falls back to 100.
    max_depth : int-like or None
        Maximum tree depth for ``"Tree-based"``; falsy values mean no limit.
    target : object
        Stored on the instance; not read by ``fit`` or ``transform``.

    Attributes
    ----------
    selected_features_ : list or None
        Column labels kept by the selection methods, set by ``fit``.
    pca_ : PCA or None
        Fitted PCA object, set by ``fit`` when ``method == "PCA"``.
    importances_ : dict or None
        Column label -> importance, set by ``fit`` for ``"Tree-based"``.

    Notes
    -----
    ``"Corr with Target"`` and ``"Tree-based"`` use ``X.corrwith`` and
    ``X.columns``, so they need ``X`` to be a pandas DataFrame.
    NOTE(review): ``"Tree-based"`` always fits a regressor; confirm this is
    intended when the prediction type is classification.
    """

    _SUPERVISED_METHODS = ("Corr with Target", "Tree-based")
    _KNOWN_METHODS = ("No Reduction", "PCA") + _SUPERVISED_METHODS

    def __init__(self, method="No Reduction", num_features=None, num_trees=100, max_depth=None, target=None):
        self.method = method
        self.num_features = self._optional_int(num_features)
        # A config without a tree count reaches us as None; use the signature
        # default instead of failing in int(None).
        self.num_trees = 100 if num_trees is None else int(num_trees)
        self.max_depth = self._optional_int(max_depth)
        self.target = target
        self.selected_features_ = None
        self.pca_ = None
        self.importances_ = None

    @staticmethod
    def _optional_int(value):
        """Return ``int(value)``, or None when ``value`` is falsy."""
        return int(value) if value else None

    def fit(self, X, y=None):
        """Learn the reduction from ``X`` (and ``y`` for supervised methods).

        Returns
        -------
        FeatureReducer
            ``self``, so calls can be chained and ``fit_transform`` works.

        Raises
        ------
        ValueError
            If ``method`` is not a supported name, or a supervised method is
            used without ``y``.
        """
        if self.method not in self._KNOWN_METHODS:
            raise ValueError(f"Unknown feature reduction method: {self.method}")

        # Drop anything learned by an earlier fit so stale state cannot leak
        # into transform().
        self.selected_features_ = None
        self.pca_ = None
        self.importances_ = None

        if self.method == "No Reduction":
            return self

        if self.method in self._SUPERVISED_METHODS and y is None:
            raise ValueError(f"Feature reduction method '{self.method}' requires a target y")

        # Coerce again here: set_params() may have replaced the attributes
        # with raw config values after construction.
        n_keep = self._optional_int(self.num_features)

        if self.method == "PCA":
            self.pca_ = PCA(n_components=n_keep)
            self.pca_.fit(X)

        elif self.method == "Corr with Target":
            strength = X.corrwith(y).abs().sort_values(ascending=False)
            # A None limit slices to the full ranking, i.e. keeps every column.
            self.selected_features_ = strength.iloc[:n_keep].index.tolist()

        elif self.method == "Tree-based":
            rf = RandomForestRegressor(
                n_estimators=int(self.num_trees),
                max_depth=self._optional_int(self.max_depth),
                random_state=42,  # fixed seed keeps the ranking reproducible
            )
            rf.fit(X, y)
            self.importances_ = dict(zip(X.columns, rf.feature_importances_))
            ranked = sorted(self.importances_, key=self.importances_.get, reverse=True)
            self.selected_features_ = ranked[:n_keep]

        return self

    def transform(self, X):
        """Apply the fitted reduction to ``X``.

        Returns ``X`` itself for ``"No Reduction"``, the PCA projection for
        ``"PCA"``, and ``X`` restricted to the selected columns otherwise.

        Raises
        ------
        ValueError
            If ``method`` is not a supported name.
        """
        if self.method == "No Reduction":
            return X
        elif self.method == "PCA":
            return self.pca_.transform(X)
        elif self.method in ["Corr with Target", "Tree-based"]:
            return X[self.selected_features_]
        else:
            raise ValueError(f"Unknown feature reduction method: {self.method}")

In [53]:
# Feature-reduction section of the run configuration.
reduction_cfg = config["design_state_data"]["feature_reduction"]

# The method name is required; the remaining settings are optional and are
# passed through as None when the config does not provide them.
reducer = FeatureReducer(
    method=reduction_cfg["feature_reduction_method"],
    **{
        param: reduction_cfg.get(cfg_key)
        for param, cfg_key in (
            ("num_features", "num_of_features_to_keep"),
            ("num_trees", "num_of_trees"),
            ("max_depth", "depth_of_trees"),
        )
    },
)