In [25]:
import json
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
import xgboost as xgb  
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import os

In [26]:
# Load the JSON pipeline configuration that drives every later cell.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's locale default (which differs between operating systems).
with open('input.json', 'r', encoding='utf-8') as f:
    input_data = json.load(f)

In [27]:
def impute_numericals(strategy, value):
    """Build the missing-value imputer for one numerical feature.

    Args:
        strategy: Imputation choice from the feature-handling config. Exactly
            "Average of values" selects mean imputation; any other value
            selects constant imputation.
        value: Fill value for constant imputation (not used for the mean).

    Returns:
        An unfitted ``SimpleImputer``.
    """
    if strategy != "Average of values":
        # Everything that is not the mean option falls back to a constant fill.
        return SimpleImputer(strategy="constant", fill_value=value)
    return SimpleImputer(strategy="mean")

def scale_numericals(rescaling):
    """Return the scaler for a numerical feature, or None when disabled.

    Args:
        rescaling: Rescaling choice from the feature-handling config. Exactly
            "No rescaling" disables scaling; any other value selects
            standardisation.

    Returns:
        ``None`` or an unfitted ``StandardScaler``.
    """
    return None if rescaling == "No rescaling" else StandardScaler()

def hash_text(columns):
    """Build a hashing vectorizer for a text feature.

    Args:
        columns: Requested number of hash columns. A value equal to 0 is
            replaced by 1 before the vectorizer is created.

    Returns:
        An unfitted ``HashingVectorizer`` with non-negative, un-normalised
        output (``alternate_sign=False``, ``norm=None``).
    """
    n_hash_columns = 1 if columns == 0 else columns
    return HashingVectorizer(n_features=n_hash_columns, alternate_sign=False, norm=None)
def iterate_features_and_handle(all_features_to_handle, X):
    """Impute/scale numerical features and hash text features of ``X``.

    Every selected feature gets its own transformer inside a single
    ``ColumnTransformer``, so each column is processed with its own
    configuration. Unselected features and features of any other type are
    dropped from the result.

    Args:
        all_features_to_handle: Mapping of feature name to its handling
            config. Each entry is read for ``is_selected``,
            ``feature_variable_type`` and ``feature_details``
            (``impute_with``, ``impute_value``, ``rescaling`` for numerical
            features; ``hash_columns`` for text features).
        X: DataFrame containing the configured features.

    Returns:
        A DataFrame with one column per numerical feature (named after the
        feature) and one or more columns per text feature. A text feature
        hashed into a single column keeps its name; otherwise the columns are
        named ``<feature>_hash_<i>``. The index of ``X`` is preserved.

    Note:
        The target variable is not known here; if it appears in the config as
        a selected feature it is transformed like any other feature.
    """
    transformers = []
    output_columns = []

    for position, (feature, details) in enumerate(all_features_to_handle.items()):
        if not details['is_selected']:
            continue

        variable_type = details['feature_variable_type']
        feature_details = details['feature_details']
        # Positional step names avoid clashes when a feature name contains
        # "__" or equals a ColumnTransformer parameter name.
        step_name = f"feature_{position}"

        if variable_type == 'numerical':
            steps = [('imputer', impute_numericals(strategy=feature_details['impute_with'],
                                                   value=feature_details['impute_value']))]
            scaler = scale_numericals(rescaling=feature_details['rescaling'])
            if scaler is not None:
                # The scaler is only part of the pipeline when rescaling is requested.
                steps.append(('scaler', scaler))
            # A list selector hands the pipeline a 2-D single-column frame.
            transformers.append((step_name, Pipeline(steps), [feature]))
            output_columns.append(feature)
        elif variable_type == 'text':
            text_vectorizer = hash_text(feature_details['hash_columns'])
            # A scalar selector hands the vectorizer the 1-D column of strings.
            transformers.append((step_name, text_vectorizer, feature))
            n_hashed = text_vectorizer.n_features
            if n_hashed == 1:
                output_columns.append(feature)
            else:
                output_columns.extend(f"{feature}_hash_{i}" for i in range(n_hashed))

    if not transformers:
        # Nothing selected: return an empty frame aligned with the input rows.
        return pd.DataFrame(index=X.index)

    # sparse_threshold=0 forces a dense array even though hashing yields sparse output.
    preprocessor = ColumnTransformer(transformers=transformers, sparse_threshold=0)
    transformed = preprocessor.fit_transform(X)

    return pd.DataFrame(transformed, columns=output_columns, index=X.index)


def generate_features(dataset, feature_generation):
    """Add interaction features to a copy of ``dataset``.

    Args:
        dataset: DataFrame holding the source columns.
        feature_generation: Config dict. Optional keys:
            ``linear_interactions`` - pairs ``[a, b]``; adds column ``a_b``
            as the element-wise product of the two columns.
            ``polynomial_interactions`` - strings ``"a/b"``; adds every
            ``PolynomialFeatures(include_bias=False)`` output column as
            ``poly_a_b_<i>``.
            ``explicit_pairwise_interactions`` - strings ``"a/b"``; adds
            column ``a_b`` as the element-wise product.

    Returns:
        A new DataFrame with the generated columns appended. The input frame
        is left untouched so re-running the call is idempotent.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    dataset = dataset.copy()

    # Linear interactions
    for interaction in feature_generation.get("linear_interactions", []):
        first, second = interaction[0], interaction[1]
        dataset[f"{first}_{second}"] = dataset[first] * dataset[second]

    # Polynomial interactions
    polynomial_interactions = feature_generation.get("polynomial_interactions", [])
    if polynomial_interactions:
        # Only build the transformer when it is actually needed.
        poly = PolynomialFeatures(include_bias=False)
        for interaction in polynomial_interactions:
            interaction_split = interaction.split("/")
            first, second = interaction_split[0], interaction_split[1]
            transformed = poly.fit_transform(dataset[[first, second]])
            for i in range(transformed.shape[1]):
                dataset[f"poly_{first}_{second}_{i}"] = transformed[:, i]

    # Explicit pairwise interactions
    for interaction in feature_generation.get("explicit_pairwise_interactions", []):
        interaction_split = interaction.split("/")
        first, second = interaction_split[0], interaction_split[1]
        dataset[f"{first}_{second}"] = dataset[first] * dataset[second]

    return dataset

def reduce_features(dataset, config, target_variable):
    """Keep only the most important features according to a random forest.

    Args:
        dataset: DataFrame containing the features and the target column.
        config: Feature-reduction config. ``feature_reduction_method`` must be
            "Tree-based" for any reduction to happen; then
            ``num_of_features_to_keep``, ``num_of_trees`` and
            ``depth_of_trees`` are read and cast to ``int``.
        target_variable: Name of the target column in ``dataset``.

    Returns:
        For "Tree-based": a DataFrame with the top features (ranked by
        ``RandomForestRegressor`` importances) followed by the target column.
        For any other method: a message is printed and ``dataset`` is returned
        unchanged, so later pipeline stages still receive a DataFrame.
    """
    if config["feature_reduction_method"] != "Tree-based":
        print("Unsupported feature reduction method. Please choose 'Tree-based'.")
        # Pass the data through instead of returning None, which would crash
        # the next pipeline stage.
        return dataset

    num_of_features_to_keep = int(config["num_of_features_to_keep"])
    num_of_trees = int(config["num_of_trees"])
    depth_of_trees = int(config["depth_of_trees"])

    # Select features and target variable
    X = dataset.drop(columns=[target_variable])
    y = dataset[target_variable]

    # Fixed random_state keeps the importance ranking reproducible.
    rf = RandomForestRegressor(n_estimators=num_of_trees, max_depth=depth_of_trees, random_state=42)
    rf.fit(X, y)

    # Rank the features by importance and keep the top k.
    feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
    top_features = feature_importances.nlargest(num_of_features_to_keep).index.tolist()

    return dataset[top_features + [target_variable]]


def partition_data(data, config):
    """
    Split ``data`` into train/test partitions or into K folds.

    Args:
    - data (DataFrame): The dataset to be partitioned.
    - config (dict): Partitioning options. Keys read (with defaults):
      "policy" ("Split the dataset"), "sampling_method"
      ("No sampling(whole data)"), "split" ("Randomly"), "k_fold" (False),
      "train_ratio" (0) and "random_seed" (None).

    Returns:
    - dict: ``{"train": ..., "test": ...}`` when ``k_fold`` is falsy, otherwise
      ``{"fold_<i>": {"train": ..., "test": ...}}`` with folds numbered from 1.

    Raises:
    - ValueError: for an unsupported policy, sampling method or split, or when
      a plain split is requested with ``train_ratio`` not greater than 0.
    """
    policy = config.get("policy", "Split the dataset")
    sampling_method = config.get("sampling_method", "No sampling(whole data)")
    split = config.get("split", "Randomly")
    k_fold = config.get("k_fold", False)
    train_ratio = config.get("train_ratio", 0)
    random_seed = config.get("random_seed", None)

    # Reject every configuration that is not implemented, outermost option first.
    if policy != "Split the dataset":
        raise ValueError("Policy other than 'Split the dataset' is not implemented.")
    if sampling_method != "No sampling(whole data)":
        raise ValueError("Sampling method other than 'No sampling(whole data)' is not implemented.")
    if split != "Randomly":
        raise ValueError("Only random split is implemented currently.")

    if k_fold:
        # ``k_fold`` doubles as the number of splits.
        splitter = KFold(n_splits=k_fold, shuffle=True, random_state=random_seed)
        folds = {}
        for number, (train_rows, test_rows) in enumerate(splitter.split(data), start=1):
            folds[f"fold_{number}"] = {
                "train": data.iloc[train_rows],
                "test": data.iloc[test_rows],
            }
        return folds

    if not train_ratio > 0:
        raise ValueError("train_ratio must be greater than 0.")

    train_part, test_part = train_test_split(data, train_size=train_ratio, random_state=random_seed)
    return {"train": pd.DataFrame(train_part), "test": pd.DataFrame(test_part)}



def _build_model(key, value):
    """Instantiate the estimator configured under ``key``; None if unknown."""
    if key == "RandomForestClassifier":
        return RandomForestClassifier(n_estimators=value["max_trees"],
                                      min_samples_leaf=value["min_samples_per_leaf_min_value"],
                                      max_depth=value["max_depth"])
    if key == "RandomForestRegressor":
        return RandomForestRegressor(n_estimators=value["max_trees"],
                                     min_samples_leaf=value["min_samples_per_leaf_min_value"],
                                     max_depth=value["max_depth"])
    if key == "GBTClassifier":
        return GradientBoostingClassifier(n_estimators=value["fixed_number"],
                                          max_depth=value["max_depth"])
    if key == "GBTRegressor":
        return GradientBoostingRegressor(n_estimators=value["fixed_number"],
                                         max_depth=value["max_depth"])
    if key == "LinearRegression":
        return LinearRegression()
    if key == "LogisticRegression":
        return LogisticRegression()
    if key == "RidgeRegression":
        return Ridge()
    if key == "LassoRegression":
        return Lasso()
    if key == "ElasticNetRegression":
        return ElasticNet()
    if key == "DecisionTreeRegressor":
        return DecisionTreeRegressor(max_depth=value["max_depth"],
                                     min_samples_leaf=value["min_samples_per_leaf"][1])
    if key == "DecisionTreeClassifier":
        return DecisionTreeClassifier(max_depth=value["max_depth"],
                                      min_samples_leaf=value["min_samples_per_leaf"][1])
    if key == "SVM":
        return SVC()
    if key == "SVR":
        return SVR()
    if key == "SGD":
        return SGDRegressor(alpha=value["alpha_value"][0])
    if key == "KNN":
        return KNeighborsClassifier(n_neighbors=value["k_value"][0])
    if key == "KNNRegressor":
        return KNeighborsRegressor(n_neighbors=value["k_value"][0])
    if key == "extra_random_trees":
        return ExtraTreesRegressor(n_estimators=value["num_of_trees"][1],
                                   max_depth=value["max_depth"][1],
                                   min_samples_leaf=value["min_samples_per_leaf"][1])
    if key == "neural_network":
        return MLPClassifier(hidden_layer_sizes=value["hidden_layer_sizes"])
    if key == "neural_network_regressor":
        return MLPRegressor(hidden_layer_sizes=value["hidden_layer_sizes"])
    if key == "xg_boost":
        return xgb.XGBRegressor(n_estimators=value["max_num_of_trees"],
                                max_depth=value["max_depth_of_tree"][1],
                                learning_rate=value["learningRate"][0],
                                reg_alpha=value["l1_regularization"][0],
                                reg_lambda=value["l2_regularization"][0],
                                gamma=value["gamma"][0],
                                min_child_weight=value["min_child_weight"][0],
                                subsample=value["sub_sample"][0],
                                colsample_bytree=value["col_sample_by_tree"][0],
                                early_stopping_rounds=value["early_stopping_rounds"])
    return None


def create_models_from_json(data):
    """Instantiate every algorithm marked as selected in the config.

    Args:
        data: Mapping of algorithm key to its settings. Each entry must have
            ``is_selected``; the remaining keys read depend on the algorithm.

    Returns:
        dict mapping each selected, supported algorithm key to an unfitted
        estimator. The dict is empty when nothing is selected. Selected keys
        that are not recognised are reported with a message and skipped.
    """
    models = {}
    for key, value in data.items():
        # Explicit comparison kept on purpose: only a true boolean (or 1)
        # selects an algorithm; other truthy values such as strings do not.
        if value["is_selected"] == True:  # noqa: E712
            model = _build_model(key, value)
            if model is None:
                print(f"Unsupported algorithm '{key}' in configuration. Skipping.")
                continue
            models[key] = model

    # Return only after the whole config has been scanned, so that every
    # selected algorithm is included rather than just the first one.
    return models
def evaluate_models(models, metrics, X_train, y_train, X_test, y_test):
    """Fit each model and score its test predictions with every metric.

    Args:
        models: Mapping of model name to an estimator with ``fit``/``predict``.
        metrics: Mapping of metric name to a callable ``f(y_true, y_pred)``.
        X_train, y_train: Training features and targets.
        X_test, y_test: Test features and targets.

    Returns:
        dict of ``{model_name: {metric_name: score}}``.
    """
    scores_by_model = {}

    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        # One entry per metric, each computed against the same predictions.
        scores_by_model[label] = {
            metric_label: scorer(y_test, predictions)
            for metric_label, scorer in metrics.items()
        }

    return scores_by_model


def evaluate_models_with_metrics(models, X_train, y_train, X_test, y_test, metrics):
    """Fit each model and compute ROC AUC, an F1-optimal threshold and lift.

    Args:
        models: Mapping of model name to an estimator. Probabilities are taken
            from ``predict_proba`` when the estimator has it.
        X_train, y_train: Training features and targets.
        X_test, y_test: Test features and targets. ``y_test`` must support
            ``.sum()`` and ``len()``.
        metrics: Metrics config. Keys read: ``optomize_model_hyperparameters_for``
            (spelling as in the config), ``optimize_threshold_for`` and
            ``compute_lift_at``.

    Returns:
        dict of ``{model_name: {"roc_auc", "f1_optimal_threshold",
        "lift_at_threshold"}}``. A value is ``None`` when it was not computed.
    """
    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)

        # Column 1 is used as the positive-class score (assumes a binary
        # classifier - TODO confirm for multi-class configs).
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test)[:, 1]
        else:
            y_prob = None

        roc_auc = None
        f1_optimal_threshold = None
        lift_at_threshold = None

        if y_prob is not None:
            roc_auc = roc_auc_score(y_test, y_prob)

        optimizing_for_auc = (
            metrics["optomize_model_hyperparameters_for"] == "AUC" and roc_auc is not None
        )
        if not optimizing_for_auc and metrics["optimize_threshold_for"] == "F1 Score":
            f1_optimal_threshold = 0
            max_f1 = 0
            # Explicit None check: truth-testing a numpy array raises ValueError.
            if y_prob is not None:
                for threshold in np.linspace(0.1, 0.9, 9):
                    y_pred_threshold = (y_prob >= threshold).astype(int)
                    f1 = f1_score(y_test, y_pred_threshold)
                    if f1 > max_f1:
                        max_f1 = f1
                        f1_optimal_threshold = threshold

        if metrics["compute_lift_at"] > 0 and y_prob is not None:
            # Without an F1-optimised threshold, fall back to the usual 0.5
            # cut-off instead of comparing probabilities against None.
            lift_threshold = 0.5 if f1_optimal_threshold is None else f1_optimal_threshold
            conf_matrix = confusion_matrix(y_test, y_prob >= lift_threshold)
            true_positives = conf_matrix[1, 1]
            positive_rate = y_test.sum() / len(y_test)
            # NOTE(review): this divides the true-positive *count* by the base
            # rate, which is not the textbook lift (precision / base rate), and
            # ``compute_lift_at`` is only used as an on/off flag - confirm intent.
            if positive_rate > 0:
                lift_at_threshold = true_positives / positive_rate

        results[name] = {
            "roc_auc": roc_auc,
            "f1_optimal_threshold": f1_optimal_threshold,
            "lift_at_threshold": lift_at_threshold
        }

    return results



def apply_weighting_strategy(dataset, config):
    """Look up the per-row sample weights named in the config.

    Args:
        dataset: DataFrame that may contain the weight column.
        config: Dict with ``weighting_strategy_method`` and, for the
            "Sample weights" method, ``weighting_strategy_weight_variable``.

    Returns:
        The weight column of ``dataset``, or ``None`` (after printing a
        message) when the method is unsupported or the column is missing.
    """
    if config["weighting_strategy_method"] != "Sample weights":
        print("Unsupported weighting strategy method. Please choose 'Sample weights'.")
        return None

    weight_variable = config["weighting_strategy_weight_variable"]
    if weight_variable not in dataset.columns:
        print(f"Weight variable '{weight_variable}' not found in dataset. Weighting strategy not applied.")
        return None

    return dataset[weight_variable]

def apply_probability_calibration(X_train, y_train, X_test, config, estimator=None):
    """Return sigmoid-calibrated (Platt scaling) probabilities for ``X_test``.

    Args:
        X_train, y_train: Data used to fit the calibrator.
        X_test: Data to predict calibrated probabilities for.
        config: Dict with ``probability_calibration_method``; only
            "Sigmoid - Platt Scaling" is supported.
        estimator: Optional already-fitted classifier to calibrate. When
            omitted, ``CalibratedClassifierCV`` fits its own default estimator
            with cross-validation, because the "prefit" mode cannot work
            without a fitted estimator.

    Returns:
        The array from ``predict_proba`` of the calibrated classifier, or
        ``None`` (after printing a message) for an unsupported method.
    """
    if config["probability_calibration_method"] != "Sigmoid - Platt Scaling":
        print("Unsupported probability calibration method. Please choose 'Sigmoid - Platt Scaling'.")
        return None

    if estimator is None:
        # No fitted model supplied: let the calibrator train and calibrate
        # its default estimator through cross-validation.
        calibrated_clf = CalibratedClassifierCV(method='sigmoid')
    else:
        try:
            # Newer scikit-learn releases replace cv='prefit' with FrozenEstimator.
            from sklearn.frozen import FrozenEstimator
        except ImportError:
            calibrated_clf = CalibratedClassifierCV(estimator, method='sigmoid', cv='prefit')
        else:
            calibrated_clf = CalibratedClassifierCV(FrozenEstimator(estimator), method='sigmoid')

    calibrated_clf.fit(X_train, y_train)
    return calibrated_clf.predict_proba(X_test)



def optimize_models(models, X_train, y_train, hyperparameters):
    """Tune each model with a cross-validated grid search.

    Args:
        models: Mapping of model name to an unfitted estimator.
        X_train, y_train: Training features and targets.
        hyperparameters: Search config. Keys read: ``stratergy`` (spelling as
            in the config; only "Grid Search" is handled), ``num_of_folds``
            and ``parallelism``.

    Returns:
        dict mapping model name to the refitted best estimator. Models are
        omitted when the strategy is not "Grid Search".
    """
    # Search spaces per model name; models without an entry are searched over
    # an empty grid, i.e. cross-validated once with their current settings.
    default_grids = {
        "RandomForestClassifier": {
            "n_estimators": [10, 20, 30],
            "max_depth": [None, 10, 20],
        },
        "RandomForestRegressor": {
            "n_estimators": [10, 20, 30],
            "max_depth": [None, 10, 20],
        },
    }

    optimized_models = {}

    for name, model in models.items():
        if hyperparameters["stratergy"] == "Grid Search":
            param_grid = default_grids.get(name, {})

            # ROC AUC needs class scores, which only estimators exposing
            # predict_proba/decision_function provide. Regressors are ranked
            # by (negated) mean squared error instead.
            can_rank_by_auc = hasattr(model, "predict_proba") or hasattr(model, "decision_function")
            scoring = 'roc_auc' if can_rank_by_auc else 'neg_mean_squared_error'

            grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=hyperparameters["num_of_folds"],
                                       scoring=scoring, n_jobs=hyperparameters["parallelism"])
            grid_search.fit(X_train, y_train)

            optimized_models[name] = grid_search.best_estimator_

    return optimized_models




In [28]:
# Load dataset
dataset_path = input_data['design_state_data']['session_info']['dataset']
if os.path.exists(dataset_path):
    data = pd.read_csv(dataset_path)
else:
    # A missing file is "not found", not "already exists".
    raise FileNotFoundError("The Specified CSV not found")

# Pull each pipeline stage's settings out of the design-state config.
all_features_to_handle = input_data['design_state_data']['feature_handling']
target_column = input_data['design_state_data']['target']['target']
features_to_generate = input_data['design_state_data']['feature_generation']
feature_reduction_json  = input_data['design_state_data']['feature_reduction']
model_config = input_data['design_state_data']["algorithms"]
training_config  = input_data['design_state_data']['train']
metrics = input_data['design_state_data']["metrics"]
hyperparameters = input_data['design_state_data']["hyperparameters"]

# The target column stays inside X through the preprocessing stages because
# reduce_features and the final split both look it up by name.
X = data
y = data[target_column]

X = iterate_features_and_handle(all_features_to_handle, X)

X = generate_features(X,features_to_generate )

X = reduce_features(X, feature_reduction_json, target_column)

# From here on X is the dict of partitions returned by partition_data.
X = partition_data(X, training_config)

x_train = X['train'].drop(columns=[target_column])
y_train = X['train'][target_column]
x_test = X['test'].drop(columns=[target_column])
# Test targets must come from the test partition so they align with x_test.
y_test = X['test'][target_column]

models = create_models_from_json(model_config)
optimized_models = optimize_models(models, x_train, y_train, hyperparameters)


# Skipping evaluation: the configured metrics are all classification metrics, but this is a regression problem.





models {'RandomForestRegressor': RandomForestRegressor(min_samples_leaf=5, n_estimators=10)}
metrics {'optomize_model_hyperparameters_for': 'AUC', 'optimize_threshold_for': 'F1 Score', 'compute_lift_at': 0, 'cost_matrix_gain_for_true_prediction_true_result': 1, 'cost_matrix_gain_for_true_prediction_false_result': 0, 'cost_matrix_gain_for_false_prediction_true_result': 0, 'cost_matrix_gain_for_false_prediction_false_result': 0}
x_train      poly_petal_length_sepal_width_0  petal_width_sepal_length  \
137                              5.5                     11.52   
84                               4.5                      8.10   
27                               1.5                      1.04   
127                              4.9                     10.98   
132                              5.6                     14.08   
..                               ...                       ...   
9                                1.5                      0.49   
103                              

TypeError: 'str' object is not callable