In [1]:
# importing necessary libraries
import json
import os
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Paths to the pipeline configuration (JSON) and to the dataset (CSV).
json_file = "algoparams_from_ui.json"
csv_file = "iris.csv"

In [3]:
def parse_json(json_file):
    """Load a JSON document from disk and report whether it is empty.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The decoded document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would mis-read or reject non-ASCII content on some systems.
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Reporting happens after the file handle has been released.
    print("Parsed JSON Data:", data)
    if data:  # Falsy documents ({}, [], null, ...) are reported as empty
        print("Data successfully loaded!")
    else:
        print("Data is empty.")
    return data

In [4]:
# 2. Load and Prepare Data
def load_data(csv_file, target_column):
    """Read a CSV file and split it into features and target.

    Args:
        csv_file: Path of the CSV file to read with ``pd.read_csv``.
        target_column: Name of the column to use as the target.

    Returns:
        A ``(X, y)`` tuple: ``X`` is the frame without ``target_column`` and
        ``y`` is that column on its own.
    """
    frame = pd.read_csv(csv_file)
    features = frame.drop(columns=[target_column])
    return features, frame[target_column]


In [5]:
# 3. Feature Handling
# Ensure target_column is not included in feature handling
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
def get_feature_handling_steps(config, X, target_column):
    """Build the preprocessing step of the pipeline from the feature settings.

    Every selected entry of ``config['feature_handling']`` other than the
    target contributes one transformer: numerical features get a
    ``SimpleImputer`` and text features a ``OneHotEncoder``. Entries of any
    other variable type contribute nothing.

    Args:
        config: Mapping with a ``'feature_handling'`` dict keyed by feature name.
        X: Feature frame; accepted for the caller's convenience but not read here.
        target_column: Name of the target; its entry is skipped.

    Returns:
        A one-element list ``[('preprocessor', ColumnTransformer(...))]``.
    """
    column_transformers = []

    for name, spec in config['feature_handling'].items():
        # The target is never preprocessed; unselected features are ignored.
        if name == target_column or not spec['is_selected']:
            continue

        kind = spec['feature_variable_type']
        if kind == 'numerical':
            options = spec['feature_details']
            # Anything other than "Average of values" is treated as a constant fill.
            if options['impute_with'] == 'Average of values':
                strategy = 'mean'
            else:
                strategy = 'constant'
            imputer = SimpleImputer(strategy=strategy, fill_value=options.get('impute_value', 0))
            column_transformers.append((f"{name}_imputer", imputer, [name]))
        elif kind == 'text':
            encoder = OneHotEncoder(handle_unknown='ignore')
            column_transformers.append((f"{name}_encoder", encoder, [name]))

    return [('preprocessor', ColumnTransformer(column_transformers))]


In [6]:
# 4. Feature Reduction
from sklearn.feature_selection import SelectFromModel
def get_feature_reduction_steps(config, random_state=42):
    """Build the optional feature-reduction step of the pipeline.

    Args:
        config: Mapping with a ``'feature_reduction'`` dict. Its
            ``'feature_reduction_method'`` key selects the method; for PCA,
            ``'num_of_features_to_keep'`` (default 5) gives the component count.
        random_state: Seed for the forest used by the tree-based selector, so
            the selected features are repeatable from run to run.

    Returns:
        A list with one ``(name, transformer)`` step for ``'Tree-based'`` or
        ``'PCA'``, or an empty list for any other method.

    Raises:
        KeyError: If ``'feature_reduction'`` or its method key is missing.
    """
    reduction = config['feature_reduction']
    method = reduction['feature_reduction_method']

    if method == 'Tree-based':
        # An unseeded forest would rank features differently on every run.
        forest = RandomForestRegressor(n_estimators=100, random_state=random_state)
        return [('feature_selection', SelectFromModel(forest))]

    if method == 'PCA':
        # The value may arrive as a string, hence the int() conversion.
        n_components = int(reduction.get('num_of_features_to_keep', 5))
        return [('pca', PCA(n_components=n_components))]

    return []


In [7]:
# 5. Model Selection
def get_model_and_hyperparams(config, random_state=42):
    """Collect the selected models together with their hyperparameter grids.

    Only ``'RandomForestRegressor'`` is handled; other selected algorithm
    names are ignored.

    Args:
        config: Mapping with an ``'algorithms'`` dict keyed by model name. Each
            entry needs ``'is_selected'``; the random-forest entry also needs
            ``'min_trees'``, ``'max_trees'``, ``'min_depth'`` and ``'max_depth'``.
        random_state: Seed passed to the created estimator so repeated runs
            produce the same fitted model.

    Returns:
        A list of ``(model_name, estimator, param_grid)`` tuples. Grid keys are
        prefixed with ``model__`` to address the pipeline step named ``model``.
    """
    models = []

    for model_name, model_conf in config['algorithms'].items():
        if not model_conf['is_selected']:
            continue

        if model_name == 'RandomForestRegressor':
            model = RandomForestRegressor(random_state=random_state)
            # dict.fromkeys() drops the duplicate when min == max, so the grid
            # search does not fit the same candidate twice.
            hyperparams = {
                'model__n_estimators': list(dict.fromkeys(
                    [model_conf['min_trees'], model_conf['max_trees']])),
                'model__max_depth': list(dict.fromkeys(
                    [model_conf['min_depth'], model_conf['max_depth']])),
            }
            models.append((model_name, model, hyperparams))

    return models

In [8]:
# 6. Build and Run Pipeline
def build_and_run_pipeline(X, y, config):
    """Assemble, fit and evaluate one pipeline per selected model.

    The data is split 80/20 with a fixed seed. Each pipeline is
    preprocessing + optional feature reduction + model, wrapped in a 5-fold
    ``GridSearchCV`` when the model comes with a hyperparameter grid. Metrics
    are printed for every model.

    Args:
        X: Feature frame.
        y: Target values.
        config: Parsed configuration; everything used here lives under
            ``config['design_state_data']``.

    Returns:
        A dict keyed by model name. Each value holds ``'mse'``, ``'r2'``,
        ``'best_params'`` (``None`` without a grid search) and the fitted
        ``'estimator'``.
    """
    design = config['design_state_data']
    # Take the target name from the config instead of depending on a
    # notebook-level global that only exists once the main cell has run.
    target_column = design['target']['target']

    feature_handling_steps = get_feature_handling_steps(design, X, target_column)
    feature_reduction_steps = get_feature_reduction_steps(design)
    models = get_model_and_hyperparams(design)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    results = {}
    for model_name, model, hyperparams in models:
        print(f"\nTraining model: {model_name}")

        steps = feature_handling_steps + feature_reduction_steps + [('model', model)]
        pipeline = Pipeline(steps)

        if hyperparams:
            pipeline = GridSearchCV(pipeline, param_grid=hyperparams, cv=5)

        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict(X_test)

        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        # Only a GridSearchCV wrapper exposes best_params_.
        best_params = getattr(pipeline, 'best_params_', None)

        print("Mean Squared Error:", mse)
        print("R2 Score:", r2)
        if best_params is not None:
            print("Best Hyperparameters:", best_params)

        results[model_name] = {
            'mse': mse,
            'r2': r2,
            'best_params': best_params,
            'estimator': pipeline,
        }

    return results


In [9]:
# 7. Main Execution
if __name__ == '__main__':
    # json_file and csv_file are taken from the configuration cell near the top
    # of the notebook; re-assigning them here would silently override any
    # change made there.
    config = parse_json(json_file)
    target_column = config['design_state_data']['target']['target']

    X, y = load_data(csv_file, target_column)
    build_and_run_pipeline(X, y, config)

Parsed JSON Data: {'session_name': 'test', 'session_description': 'test', 'design_state_data': {'session_info': {'project_id': '1', 'experiment_id': 'kkkk-11', 'dataset': 'iris.csv', 'session_name': 'test', 'session_description': 'test'}, 'target': {'prediction_type': 'Regression', 'target': 'petal_width', 'type': 'regression', 'partitioning': True}, 'train': {'policy': 'Split the dataset', 'time_variable': 'sepal_length', 'sampling_method': 'No sampling(whole data)', 'split': 'Randomly', 'k_fold': False, 'train_ratio': 0, 'random_seed': 0}, 'metrics': {'optomize_model_hyperparameters_for': 'AUC', 'optimize_threshold_for': 'F1 Score', 'compute_lift_at': 0, 'cost_matrix_gain_for_true_prediction_true_result': 1, 'cost_matrix_gain_for_true_prediction_false_result': 0, 'cost_matrix_gain_for_false_prediction_true_result': 0, 'cost_matrix_gain_for_false_prediction_false_result': 0}, 'feature_handling': {'sepal_length': {'feature_name': 'sepal_length', 'is_selected': True, 'feature_variable_t