In [199]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import os

In [200]:
# Load the JSON run configuration that drives every later step.
# JSON text is UTF-8 by specification, so the encoding is pinned here instead of
# relying on the platform's locale default (which is not UTF-8 everywhere).
with open('input.json', 'r', encoding='utf-8') as f:
    input_data = json.load(f)

In [201]:
# Load dataset
# The CSV location is read from the session section of the JSON configuration.
dataset_path = input_data['design_state_data']['session_info']['dataset']
if os.path.exists(dataset_path):
    data = pd.read_csv(dataset_path)
else:
    # A missing file is a FileNotFoundError. FileExistsError signals the opposite
    # condition (the path already exists) and would mislead anyone handling it.
    raise FileNotFoundError(f"The Specified CSV not found: {dataset_path}")

In [202]:
# Separate features and target
# The name of the target column is read from the JSON configuration.
target_column = input_data['design_state_data']['target']['target']
# NOTE(review): X is bound to the same DataFrame object as `data` (no copy is
# made), so at this point it still contains the target column as well.
X = data
y = data[target_column]

In [203]:
# Display the configured target column name.
target_column

'petal_width'

## Feature Handling

Process the features one by one, since what to do with each of them can vary.

In [204]:
# Per-feature handling settings from the JSON configuration; consumed later as a
# mapping of feature name -> details.
all_features_to_handle = input_data['design_state_data']['feature_handling']

In [205]:
def impute_numericals(strategy, value):
    """Build the (unfitted) imputer for a numerical feature.

    Args:
        strategy: Imputation choice from the configuration. The literal
            "Average of values" selects mean imputation; every other value
            selects constant imputation.
        value: Fill value used for constant imputation; not used for mean
            imputation.

    Returns:
        A SimpleImputer configured with either the "mean" or the "constant"
        strategy.
    """
    if strategy != "Average of values":
        # Anything that is not the mean option falls back to a constant fill.
        return SimpleImputer(strategy="constant", fill_value=value)
    return SimpleImputer(strategy="mean")

def scale_numericals(rescaling):
    """Pick the scaler for a numerical feature.

    Args:
        rescaling: Rescaling choice from the configuration.

    Returns:
        None when the choice is the literal "No rescaling"; otherwise a new,
        unfitted StandardScaler.
    """
    return None if rescaling == "No rescaling" else StandardScaler()

def hash_text(columns):
    """Build the hashing vectorizer for a text feature.

    Args:
        columns: Requested number of hash columns. A request of 0 is bumped
            to 1; any other value is passed through unchanged.

    Returns:
        A HashingVectorizer with that many output features, with alternating
        signs and normalisation both switched off.
    """
    n_hash_columns = 1 if columns == 0 else columns
    return HashingVectorizer(
        n_features=n_hash_columns,
        alternate_sign=False,
        norm=None,
    )
def iterate_features_and_handle(all_features_to_handle, X):
    """Apply the configured per-feature preprocessing and return a new DataFrame.

    Each selected feature gets its own transformer, applied to that column
    only: numerical features are imputed and, unless the configuration says
    "No rescaling", standardised; text features are hashed. Features that are
    not selected, or whose type is neither 'numerical' nor 'text', are left
    out of the result.

    NOTE(review): per the original author's remark, the target column is also
    listed in the handling configuration; it is treated like any other
    selected feature here.

    Args:
        all_features_to_handle: Mapping of feature name -> handling details
            (keys read: 'is_selected', 'feature_variable_type',
            'feature_details').
        X: DataFrame holding a column for every selected feature.

    Returns:
        DataFrame indexed like X, with one column per numerical feature and
        one column per hash bucket of each text feature, in configuration
        order. A text feature hashed into a single bucket keeps its own name;
        otherwise its columns are named "<feature>_hash_<i>".

    Raises:
        ValueError: If the configuration selects no numerical or text feature.
    """
    transformers = []
    output_columns = []

    for feature, details in all_features_to_handle.items():
        if not details['is_selected']:
            continue

        variable_type = details['feature_variable_type']
        if variable_type == 'numerical':
            feature_details = details['feature_details']
            steps = [(
                'imputer',
                impute_numericals(
                    strategy=feature_details['impute_with'],
                    value=feature_details['impute_value'],
                ),
            )]
            # The scaler is optional: scale_numericals returns None for
            # "No rescaling", and a Pipeline step must be a real estimator.
            scaler = scale_numericals(rescaling=feature_details['rescaling'])
            if scaler is not None:
                steps.append(('scaler', scaler))
            # A list selector hands the transformer a 2-D single-column frame,
            # so this feature's settings never touch any other column.
            transformers.append((feature, Pipeline(steps), [feature]))
            output_columns.append(feature)
        elif variable_type == 'text':
            text_vectorizer = hash_text(details['feature_details']['hash_columns'])
            # A scalar selector hands the vectorizer the 1-D column of strings
            # that text vectorizers expect.
            transformers.append((feature, text_vectorizer, feature))
            n_buckets = text_vectorizer.n_features
            if n_buckets == 1:
                output_columns.append(feature)
            else:
                output_columns.extend(
                    f"{feature}_hash_{i}" for i in range(n_buckets)
                )

    if not transformers:
        raise ValueError("No selected numerical or text features to process")

    # sparse_threshold=0 forces a dense result even though the hashing
    # vectorizers emit sparse matrices, so it can be wrapped in a DataFrame.
    preprocessor = ColumnTransformer(transformers=transformers, sparse_threshold=0)
    transformed = preprocessor.fit_transform(X)

    # Column labels are derived from the transformers actually applied, so they
    # always match the width and order of the transformed array.
    return pd.DataFrame(transformed, columns=output_columns, index=X.index)



# Preprocess from the raw frame (`data`, the object X was bound to) rather than
# from X itself, so re-running this cell always starts from the same input
# instead of an already-transformed X.
X = iterate_features_and_handle(all_features_to_handle, data)

## Feature Generation

In [206]:
# Feature-generation settings from the JSON configuration.
features_to_generate = input_data['design_state_data']['feature_generation']


In [207]:
def generate_features(dataset, feature_generation):
    """Return a copy of ``dataset`` extended with the configured interaction features.

    Args:
        dataset: DataFrame holding every column named by the interactions.
        feature_generation: Mapping with the optional keys
            "linear_interactions" (pairs of column names),
            "polynomial_interactions" and "explicit_pairwise_interactions"
            (strings of the form "col_a/col_b"). Missing keys mean "none".

    Returns:
        A new DataFrame with the generated columns added; the frame passed in
        is left unmodified.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    generated = dataset.copy()

    # Linear interactions: element-wise product, stored as "<a>_<b>".
    for interaction in feature_generation.get("linear_interactions", []):
        first, second = interaction[0], interaction[1]
        generated[f"{first}_{second}"] = generated[first] * generated[second]

    # Polynomial interactions: every output column of PolynomialFeatures for
    # the pair, stored as "poly_<a>_<b>_<i>" in the transformer's output order.
    polynomial_interactions = feature_generation.get("polynomial_interactions", [])
    if polynomial_interactions:
        # Built only when needed; one instance is re-fitted for each pair.
        poly = PolynomialFeatures(include_bias=False)
        for interaction in polynomial_interactions:
            parts = interaction.split("/")
            first, second = parts[0], parts[1]
            transformed = poly.fit_transform(generated[[first, second]])
            for i in range(transformed.shape[1]):
                generated[f"poly_{first}_{second}_{i}"] = transformed[:, i]

    # Explicit pairwise interactions: element-wise product, stored as "<a>_<b>".
    for interaction in feature_generation.get("explicit_pairwise_interactions", []):
        parts = interaction.split("/")
        first, second = parts[0], parts[1]
        generated[f"{first}_{second}"] = generated[first] * generated[second]

    return generated

# Add the configured interaction features to the preprocessed frame.
x_generated = generate_features(X,features_to_generate )

## Feature Reduction

In [209]:
# Feature-reduction settings from the JSON configuration.
feature_reduction_json  = input_data['design_state_data']['feature_reduction']

In [214]:
def reduce_features(dataset, config, target_variable):
    """Reduce ``dataset`` to its most important features.

    Only the "Tree-based" method is implemented: a random forest regressor is
    fitted on every column except the target, and the columns with the largest
    feature importances are kept.

    Args:
        dataset: DataFrame containing the feature columns and the target column.
        config: Mapping with "feature_reduction_method" and, for "Tree-based",
            "num_of_features_to_keep", "num_of_trees" and "depth_of_trees"
            (each converted with int()).
        target_variable: Name of the target column in ``dataset``.

    Returns:
        For "Tree-based": a new DataFrame with the selected feature columns
        followed by the target column. For any other method: ``dataset``
        unchanged, after printing a notice.
    """
    if config["feature_reduction_method"] != "Tree-based":
        print("Unsupported feature reduction method. Please choose 'Tree-based'.")
        # Hand the data back untouched rather than falling through to an
        # implicit None, which would break every step that follows.
        return dataset

    num_of_features_to_keep = int(config["num_of_features_to_keep"])
    num_of_trees = int(config["num_of_trees"])
    depth_of_trees = int(config["depth_of_trees"])

    # Split into predictors and target for the importance model.
    features = dataset.drop(columns=[target_variable])
    target = dataset[target_variable]

    # Fixed seed so the importance ranking is reproducible between runs.
    rf = RandomForestRegressor(
        n_estimators=num_of_trees,
        max_depth=depth_of_trees,
        random_state=42,
    )
    rf.fit(features, target)

    # Rank columns by importance and keep the top k.
    feature_importances = pd.Series(rf.feature_importances_, index=features.columns)
    top_features = feature_importances.nlargest(num_of_features_to_keep).index.tolist()

    # .copy() gives the caller an independent frame instead of a slice of the
    # input, so later column assignments on the result are unambiguous.
    return dataset[top_features + [target_variable]].copy()

# Apply the configured feature reduction to the generated feature set.
reduced_features = reduce_features(x_generated, feature_reduction_json, target_column)

In [215]:
# Display the result of the feature reduction step.
reduced_features

Unnamed: 0,poly_petal_length_sepal_width_0,petal_width_sepal_length,poly_petal_width_species_0,poly_petal_width_species_3,petal_width
0,1.4,1.02,0.2,0.4,0.2
1,1.4,0.98,0.2,0.4,0.2
2,1.3,0.94,0.2,0.4,0.2
3,1.5,0.92,0.2,0.4,0.2
4,1.4,1.00,0.2,0.4,0.2
...,...,...,...,...,...
145,5.2,15.41,2.3,4.6,2.3
146,5.0,11.97,1.9,3.8,1.9
147,5.2,13.00,2.0,4.0,2.0
148,5.4,14.26,2.3,4.6,2.3
