In [179]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, StratifiedKFold, KFold
from sklearn.metrics import auc, f1_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from xgboost import XGBClassifier, XGBRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

# Load the JSON configuration. The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of using the locale default.
with open("/content/json-fixer.json", encoding="utf-8") as f:
    config = json.load(f)


def extract_session_info(config):
    """Return the session identification fields from the experiment config.

    Reads ``config['design_state_data']['session_info']`` and returns the
    5-tuple ``(project_id, experiment_id, dataset, session_name,
    session_description)``. A missing key raises ``KeyError``.
    """
    info = config['design_state_data']['session_info']
    wanted = (
        'project_id',
        'experiment_id',
        'dataset',
        'session_name',
        'session_description',
    )
    return tuple(info[key] for key in wanted)



# data is the dataset containing the features and target variable
#data = pd.read_csv('/content/iris.csv')
# You can replace 'data' with your actual dataset


# Target settings.
# NOTE: `data` must already hold the loaded dataset (a DataFrame) when this
# section runs; nothing above assigns it.
# Each value is assigned without a trailing comma: a trailing comma would wrap
# it in a 1-tuple, and `target_column` has to be a plain column label.
target_config = config['design_state_data']['target']
prediction_type = target_config['prediction_type']
target_column = target_config['target']
X = data.drop(target_column, axis=1)
y = data[target_column]
partitioning = target_config['partitioning']


# Train policy. The train settings are nested under 'design_state_data', next
# to the target settings.
train_config = config['design_state_data']['train']
train_policy = train_config['policy']
variable = train_config['time_variable']
sampling_method = train_config['sampling_method']
split_method = train_config['split']
k_fold = train_config['k_fold']
# NOTE(review): the config's `train_ratio` is stored under the name
# `test_size`; confirm which fraction of the data the value represents.
test_size = train_config['train_ratio']
random_state = train_config['random_seed']


# Train-test split
#if config['train']['split'] == 'Randomly':
#    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config['train']['train_ratio'], random_state=config['train']['random_seed'])
#else:
    # Implement other partitioning strategies if needed
#    pass

# Evaluation metrics

def optimize_metrics(config, y_true, y_pred):
    """Placeholder for metric computation driven by the metrics settings.

    Reads the hyperparameter-optimisation metric, the threshold-optimisation
    metric, the lift percentile and the four cost-matrix gains from
    ``config``. None of the computations are implemented yet: nothing is
    derived from ``y_true`` / ``y_pred`` and the function returns ``None``.

    Raises:
        KeyError: if an expected key is missing from ``config``.
    """
    hyperparameter_metric = config['optomize_model_hyperparameters_for']
    threshold_metric = config['optimize_threshold_for']
    lift_percentile = config['compute_lift_at']

    if hyperparameter_metric in ('AUC', 'Accuracy'):
        # TODO: score y_pred against y_true with the selected metric.
        pass

    if threshold_metric in ('F1 Score', 'Other Metric'):
        # TODO: tune the decision threshold for the selected metric.
        pass

    lift_requested = lift_percentile > 0
    if lift_requested:
        # TODO: compute lift at `lift_percentile`.
        pass

    # Gains are looked up in a fixed order: true/true, true/false,
    # false/true, false/false (prediction/result).
    gain_keys = (
        'cost_matrix_gain_for_true_prediction_true_result',
        'cost_matrix_gain_for_true_prediction_false_result',
        'cost_matrix_gain_for_false_prediction_true_result',
        'cost_matrix_gain_for_false_prediction_false_result',
    )
    gains = [config[key] for key in gain_keys]

    # TODO: build the cost matrix from `gains` and return the computed metrics.

# Example usage:
# Assuming 'config' is the configuration dictionary for metrics as mentioned above
# Let's say 'y_true' is the true labels and 'y_pred' is the predicted labels from the model
# You can compute the metrics using the function like this:

# metrics_results = optimize_metrics(config, y_true, y_pred)
# Now, the 'metrics_results' will contain the computed metrics or any other relevant information.



#feature handling

def handle_feature(config, data):
    """Apply one feature's handling settings to ``data`` and return it.

    Args:
        config: Settings for a single feature. Reads ``feature_name``,
            ``is_selected`` and ``feature_variable_type``; for selected
            features it also reads the nested ``feature_details`` mapping.
        data: DataFrame containing the feature column. It is modified in
            place and also returned, so ``data = handle_feature(cfg, data)``
            and a bare ``handle_feature(cfg, data)`` both work.

    Returns:
        ``data`` with the feature dropped (when not selected) or with its
        missing values imputed (numerical features configured for imputation).

    Raises:
        KeyError: if a required setting is missing from ``config``.
    """
    feature_name = config['feature_name']
    is_selected = config['is_selected']
    feature_variable_type = config['feature_variable_type']

    if not is_selected:
        # Unselected features are removed from the frame.
        data.drop(columns=feature_name, inplace=True)
        return data

    details = config['feature_details']

    if feature_variable_type == 'numerical':
        numerical_handling = details['numerical_handling']
        if numerical_handling == 'Rescale':
            # TODO: rescaling is not implemented yet.
            pass
        # 'Keep as regular numerical feature' needs no transformation.

        if details['missing_values'] == 'Impute':
            impute_with = details['impute_with']
            # The filled column is assigned back to the frame. Calling
            # fillna(inplace=True) on `data[feature_name]` works on an
            # intermediate object and does not update `data` under pandas
            # Copy-on-Write.
            if impute_with == 'Average of values':
                column = data[feature_name]
                data[feature_name] = column.fillna(column.mean())
            elif impute_with == 'custom':
                # The custom value is only required when it is used.
                data[feature_name] = data[feature_name].fillna(details['impute_value'])

    elif feature_variable_type == 'text':
        text_handling = details['text_handling']
        if text_handling == 'Tokenize and hash':
            # TODO: tokenisation and hashing are not implemented yet.
            pass

    if details['make_derived_feats']:
        # TODO: derived-feature creation is not implemented yet.
        pass

    return data

# Example usage:
# Assuming 'data' is the dataset containing the features mentioned in the configuration
# Let's say 'config' is the configuration dictionary for feature handling as mentioned above
# You can handle each feature using the function like this:

# Walk only the per-feature settings (nested under 'design_state_data'), and
# use a dedicated loop variable so the top-level `config` dict is not replaced
# by the last feature's settings.
for feature_config in config['design_state_data']['feature_handling'].values():
    data = handle_feature(feature_config, data)

# Now, the 'data' will be modified according to the feature handling operations specified in the configuration.




# Feature generation

def generate_features(config, data):
    """Add interaction features to ``data`` and optionally rescale it.

    Args:
        config: Feature-generation settings with the keys
            ``linear_interactions`` (pairs of column names),
            ``polynomial_interactions`` and ``explicit_pairwise_interactions``
            ('/'-separated column names) and ``linear_scalar_type``.
        data: DataFrame holding the source columns. Product and sum columns
            are added to this frame in place; the returned frame is a new
            object whenever polynomial features or scaling are applied.

    Returns:
        DataFrame with the original columns plus the generated ones, indexed
        like ``data``.
    """
    linear_interactions = config['linear_interactions']
    linear_scalar_type = config['linear_scalar_type']
    polynomial_interactions = config['polynomial_interactions']
    explicit_pairwise_interactions = config['explicit_pairwise_interactions']

    # Linear interactions: element-wise product of the two named columns.
    for interaction in linear_interactions:
        left, right = interaction[0], interaction[1]
        data[f"{left}*{right}"] = data[left] * data[right]

    # Polynomial interactions: degree-2 terms over the named columns.
    for interaction in polynomial_interactions:
        interaction_features = interaction.split('/')
        poly = PolynomialFeatures(degree=2, include_bias=False)
        poly_values = poly.fit_transform(data[interaction_features])
        # get_feature_names_out() replaces get_feature_names(), which newer
        # scikit-learn releases no longer provide.
        poly_names = poly.get_feature_names_out(interaction_features)
        # Reuse data's index so concat aligns row-for-row even when the index
        # is not the default 0..n-1 range.
        poly_df = pd.DataFrame(poly_values, columns=poly_names, index=data.index)
        # The degree-1 terms are the input columns themselves; keep only the
        # columns that are new so no column name ends up duplicated.
        new_columns = [name for name in poly_df.columns if name not in data.columns]
        data = pd.concat([data, poly_df[new_columns]], axis=1)

    # Explicit pairwise interactions: element-wise sum of the two columns.
    for interaction in explicit_pairwise_interactions:
        interaction_features = interaction.split('/')
        left, right = interaction_features[0], interaction_features[1]
        data[f"{left}_{right}"] = data[left] + data[right]

    # Optional robust scaling of every column, keeping labels and index.
    if linear_scalar_type == 'robust':
        scaled_values = RobustScaler().fit_transform(data)
        data = pd.DataFrame(scaled_values, columns=data.columns, index=data.index)

    return data

# Example usage:
# Assuming 'data' is the dataset containing the original features as mentioned in the configuration
# Let's say 'config' is the configuration dictionary for feature generation as mentioned above
# You can generate the new features using the function like this:

# data = generate_features(config['feature_generation'], data)
# Now, the 'data' will contain the original features along with the generated linear, polynomial, and explicit pairwise interactions.
# Additionally, if specified, the data will be linearly scaled using the RobustScaler.

#feature reduction

def perform_feature_reduction(config, X_train, y_train):
    """Keep the most important features according to a random-forest model.

    Args:
        config: Feature-reduction settings. Reads ``feature_reduction_method``,
            ``num_of_features_to_keep``, ``num_of_trees`` and
            ``depth_of_trees`` (the last three are converted with ``int``).
        X_train: Training features (DataFrame or array-like, 2-D).
        y_train: Training target used to fit the forest.

    Returns:
        ``X_train`` restricted to the selected columns, in their original
        order. For any method other than 'Tree-based' and
        'Tree-based Regression', ``X_train`` is returned unchanged.

    Raises:
        ValueError: if a tree-based method is requested with fewer than one
            feature to keep.
    """
    feature_reduction_method = config['feature_reduction_method']
    num_of_features_to_keep = int(config['num_of_features_to_keep'])
    num_of_trees = int(config['num_of_trees'])
    depth_of_trees = int(config['depth_of_trees'])

    if feature_reduction_method == 'Tree-based':
        model = RandomForestClassifier(n_estimators=num_of_trees, max_depth=depth_of_trees)
    elif feature_reduction_method == 'Tree-based Regression':
        model = RandomForestRegressor(n_estimators=num_of_trees, max_depth=depth_of_trees)
    else:
        # Other feature reduction methods are not implemented.
        return X_train

    if num_of_features_to_keep < 1:
        raise ValueError("num_of_features_to_keep must be at least 1")

    # Fit the forest and rank the columns by importance, highest first.
    model.fit(X_train, y_train)
    ranked = np.argsort(model.feature_importances_)[::-1]
    # Sort the winning positions so the kept columns stay in their original
    # left-to-right order.
    keep = np.sort(ranked[:num_of_features_to_keep])

    if hasattr(X_train, 'iloc'):
        return X_train.iloc[:, keep]
    return np.asarray(X_train)[:, keep]

#hyperparameters

def optimize_hyperparameters(config, model, X_train, y_train):
    """Run a grid search over ``model`` and return the best estimator.

    Args:
        config: Hyperparameter-search settings. All ten keys read below must
            be present; only the random state, parallelism, cross-validation
            strategy, fold count and stratification flag are used so far.
        model: Estimator handed to ``GridSearchCV``.
        X_train: Training features.
        y_train: Training target.

    Returns:
        ``best_estimator_`` of the fitted search.
    """
    # Settings that are read but not wired into the search yet.
    search_strategy = config['stratergy']
    grid_shuffle = config['shuffle_grid']
    seed = config['random_state']
    iteration_cap = config['max_iterations']
    time_budget = config['max_search_time']
    worker_count = config['parallelism']
    cv_name = config['cross_validation_stratergy']
    fold_count = config['num_of_folds']
    holdout_ratio = config['split_ratio']
    use_stratification = config['stratified']

    # No candidate values are defined yet (e.g. 'param_name': [v1, v2, ...]).
    param_grid = {}

    # Map the configured strategy onto a splitter; unrecognised strategies
    # pass cv=None to GridSearchCV.
    if cv_name == 'Time-based K-fold(with overlap)':
        splitter = TimeSeriesSplit(n_splits=fold_count)
    elif cv_name != 'Stratified K-fold':
        splitter = None
    else:
        fold_cls = StratifiedKFold if use_stratification else KFold
        splitter = fold_cls(n_splits=fold_count, shuffle=True, random_state=seed)

    search = GridSearchCV(model, param_grid, cv=splitter, n_jobs=worker_count, verbose=2)
    search.fit(X_train, y_train)
    return search.best_estimator_

# Example usage:
# Assuming 'model' is the base model, 'X_train' is the training feature data, and 'y_train' is the corresponding target data
# Let's say 'config' is the configuration dictionary for hyperparameters as mentioned above
# You can perform hyperparameter optimization using the function like this:

# best_model = optimize_hyperparameters(config['hyperparameters'], model, X_train, y_train)
# Now, the 'best_model' will be the model with the optimized hyperparameters based on the grid search.


# Weighting strategy

def apply_weighting_strategy(config, data):
    """Placeholder for attaching sample weights to ``data``.

    Reads the weighting method and the weight variable from ``config``
    (``KeyError`` if either is missing). No strategy is implemented yet, so
    ``data`` is returned unchanged.
    """
    method = config['weighting_stratergy_method']
    weight_variable = config['weighting_stratergy_weight_variable']

    if method in ('Sample weights', 'Other Weighting Method'):
        # TODO: for 'Sample weights', derive a 'weights' column from
        # `weight_variable` and pass it to training, e.g.
        # model.fit(X_train, y_train, sample_weight=data['weights']).
        # Other methods still need to be defined.
        pass

    return data

# Example usage:
# Assuming 'data' is the dataset containing the features and target variable as mentioned in the configuration
# Let's say 'config' is the configuration dictionary for the weighting strategy as mentioned above
# You can apply the weighting strategy using the function like this:

# data = apply_weighting_strategy(config['weighting_stratergy'], data)
# Now, the 'data' will contain the weights based on the specified weighting strategy.
# Make sure to use these weights appropriately in your model training code.

# Probability calibration


def perform_probability_calibration(config, model, X_train, y_train):
    """Wrap ``model`` in a probability calibrator when one is configured.

    ``config['probability_calibration_method']`` selects the calibration:
    'Sigmoid - Platt Scaling' or 'Isotonic'. Any other value returns
    ``model`` untouched. The calibrator is created with ``cv='prefit'`` and
    then fitted on ``X_train`` / ``y_train``.
    """
    requested = config['probability_calibration_method']

    # (config label, CalibratedClassifierCV method) pairs.
    supported = (
        ('Sigmoid - Platt Scaling', 'sigmoid'),
        ('Isotonic', 'isotonic'),
    )
    for label, sk_method in supported:
        if requested == label:
            calibrated_model = CalibratedClassifierCV(model, method=sk_method, cv='prefit')
            calibrated_model.fit(X_train, y_train)
            return calibrated_model

    # Other probability calibration methods are not implemented.
    return model

# Example usage:
# Assuming 'model' is the base model, 'X_train' is the training feature data, and 'y_train' is the corresponding target data
# Let's say 'config' is the configuration dictionary for probability calibration as mentioned above
# You can perform probability calibration using the function like this:

# calibrated_model = perform_probability_calibration(config['probability_calibration'], model, X_train, y_train)
# Now, the 'calibrated_model' will be the model with probability calibration based on the chosen method.

# Model



def _build_model(model_name, config):
    """Instantiate the estimator for ``model_name`` from its settings.

    Returns ``None`` when ``model_name`` is not one of the supported names.
    """
    if model_name == 'Random Forest Classifier':
        return RandomForestClassifier(n_estimators=config['max_trees'],
                                      min_samples_leaf=config['min_samples_per_leaf_min_value'],
                                      max_depth=config['max_depth'],
                                      random_state=0)
    if model_name == 'Random Forest Regressor':
        return RandomForestRegressor(n_estimators=config['max_trees'],
                                     min_samples_leaf=config['min_samples_per_leaf_min_value'],
                                     max_depth=config['max_depth'],
                                     random_state=0)
    if model_name == 'Gradient Boosted Trees':
        return GradientBoostingClassifier(n_estimators=config['num_of_BoostingStages'][0],
                                          max_depth=config['max_depth'])
    if model_name == 'Gradient Boosted Trees Regressor':
        return GradientBoostingRegressor(n_estimators=config['num_of_BoostingStages'][0],
                                         max_depth=config['max_depth'])
    if model_name == 'LinearRegression':
        return LinearRegression()
    if model_name == 'LogisticRegression':
        return LogisticRegression()
    if model_name == 'RidgeRegression':
        return Ridge()
    if model_name == 'Lasso Regression':
        return Lasso()
    if model_name == 'ElasticNetRegression':
        return ElasticNet()
    if model_name == 'XG Boost':
        return XGBClassifier(n_estimators=config['max_num_of_trees'],
                             max_depth=config['max_depth_of_tree'][0],
                             learning_rate=config['learningRate'][0],
                             reg_alpha=config['l1_regularization'][0],
                             reg_lambda=config['l2_regularization'][0],
                             gamma=config['gamma'][0],
                             subsample=config['sub_sample'][0],
                             colsample_bytree=config['col_sample_by_tree'][0],
                             random_state=0)
    if model_name == 'Decision Tree':
        return DecisionTreeClassifier(min_samples_leaf=config['min_samples_per_leaf'][0],
                                      max_depth=config['max_depth'],
                                      criterion='entropy' if config['use_entropy'] else 'gini')
    if model_name == 'Support Vector Machine':
        return SVC(C=config['c_value'][0], kernel='linear')
    if model_name == 'Stochastic Gradient Descent':
        # NOTE(review): newer scikit-learn releases spell the logistic loss
        # 'log_loss'; confirm against the installed version.
        return SGDClassifier(loss='log' if config['use_logistics'] else 'modified_huber',
                             alpha=config['alpha_value'][0],
                             penalty='l1' if config['use_l1_regularization'] else 'l2',
                             l1_ratio=config['use_elastic_net_regularization'],
                             random_state=config['random_state'])
    if model_name == 'KNN':
        return KNeighborsClassifier(n_neighbors=config['k_value'][0],
                                    weights='distance' if config['distance_weighting'] else 'uniform')
    if model_name == 'Extra Random Trees':
        return ExtraTreesRegressor(n_estimators=config['num_of_trees'][0],
                                   max_depth=config['max_depth'][0],
                                   min_samples_leaf=config['min_samples_per_leaf'][0])
    if model_name == 'Neural Network':
        return MLPClassifier(hidden_layer_sizes=config['hidden_layer_sizes'],
                             activation=config['activation'],
                             alpha=config['alpha_value'],
                             max_iter=config['max_iterations'],
                             tol=config['convergence_tolerance'],
                             solver=config['solver'],
                             shuffle=config['shuffle_data'],
                             learning_rate_init=config['initial_learning_rate'],
                             batch_size='auto' if config['automatic_batching'] else None,
                             beta_1=config['beta_1'],
                             beta_2=config['beta_2'],
                             epsilon=config['epsilon'],
                             power_t=config['power_t'],
                             momentum=config['momentum'],
                             nesterovs_momentum=config['use_nesterov_momentum'])
    return None


def train_and_evaluate_model(config, X_train, y_train, X_test, y_test):
    """Train the configured model and print its score on the test split.

    Args:
        config: Settings for one model. ``model_name`` and ``is_selected``
            are always read; the remaining keys depend on the model.
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target used for scoring.

    Returns:
        ``None``. Results are printed: accuracy for classifiers, R2 and mean
        squared error for regressors. Unselected or unknown models are
        reported and skipped.
    """
    model_name = config['model_name']
    if not config['is_selected']:
        print(f"Skipping {model_name} as it is not selected.")
        return

    model = _build_model(model_name, config)
    if model is None:
        print(f"Unknown model name: {model_name}")
        return

    # Imported here because the module-level imports do not include these
    # names; placed after the early returns so skipped models never need them.
    from sklearn.base import is_classifier
    from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # The metric follows the estimator type, so no separate problem-type
    # setting is needed in the per-model config. Accuracy is only meaningful
    # for class labels, so regressors are scored with R2 and MSE instead.
    if is_classifier(model):
        accuracy = accuracy_score(y_test, y_pred)
        print(f"{model_name} - Accuracy:", accuracy)
    else:
        print(f"{model_name} - R2:", r2_score(y_test, y_pred))
        print(f"{model_name} - MSE:", mean_squared_error(y_test, y_pred))





In [3]:
# Read the configuration inside a `with` block so the file handle is closed,
# and keep the handle under its own name instead of rebinding `config`.
with open("/content/json-fixer.json", 'r', encoding='utf-8') as config_file:
    config_data = json.load(config_file)
config_data

{'session_name': 'test',
 'session_description': 'test',
 'design_state_data': {'session_info': {'project_id': '1',
   'experiment_id': 'kkkk-11',
   'dataset': 'iris_modified.csv',
   'session_name': 'test',
   'session_description': 'test'},
  'target': {'prediction_type': 'Regression',
   'target': 'petal_width',
   'type': 'regression',
   'partitioning': True},
  'train': {'policy': 'Split the dataset',
   'time_variable': 'sepal_length',
   'sampling_method': 'No sampling(whole data)',
   'split': 'Randomly',
   'k_fold': False,
   'train_ratio': 0,
   'random_seed': 0},
  'metrics': {'optomize_model_hyperparameters_for': 'AUC',
   'optimize_threshold_for': 'F1 Score',
   'compute_lift_at': 0,
   'cost_matrix_gain_for_true_prediction_true_result': 1,
   'cost_matrix_gain_for_true_prediction_false_result': 0,
   'cost_matrix_gain_for_false_prediction_true_result': 0,
   'cost_matrix_gain_for_false_prediction_false_result': 0},
  'feature_handling': {'sepal_length': {'feature_name'

In [None]:
# Load iris.csv from the working directory into a DataFrame and display it.
df=pd.read_csv('iris.csv')
df

In [None]:
# Show descriptive statistics for the dataset.
df.describe()

In [None]:
# Number of missing values per column. sum() adds up the True flags from
# isnull(); count() would report the row count for every column instead.
df.isnull().sum()

In [None]:
# Print the column dtypes and non-null counts.
df.info()

In [None]:
# Distinct values of the `species` column.
a=df['species'].unique()
a

In [None]:
# Number of rows per species value.
df['species'].value_counts()

In [None]:
# Scatter plot of petal_width against sepal_width, colored by species.
# NOTE(review): assumes seaborn was imported as `sns` in a cell not shown here.
sns.scatterplot(df, x= 'petal_width', y='sepal_width', hue='species')

In [None]:
# Scatter plot of petal_length against sepal_length, colored by species.
# NOTE(review): assumes seaborn was imported as `sns` in a cell not shown here.
sns.scatterplot(df, x= 'petal_length', y='sepal_length', hue='species')

In [None]:
# Pairwise correlations between the numeric columns, then the correlations
# with sepal_width in descending order. `numeric_only=True` keeps the
# `species` label column (presumably strings) out of the computation, which
# recent pandas versions no longer skip automatically.
corr_m = df.corr(numeric_only=True)
corr_m['sepal_width'].sort_values(ascending=False)

In [None]:

# Draw the correlation matrix as an annotated heat map on a 6x4 figure.
# NOTE(review): assumes matplotlib.pyplot (`plt`) and seaborn (`sns`) were
# imported in a cell not shown here.
plt.subplots(figsize=(6, 4))
sns.heatmap(corr_m, annot = True)
plt.title("Correlations Heat Map")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing. No random_state is passed, so the
# split differs from run to run.
train_ds, test_ds = train_test_split(df, test_size=0.25)

In [None]:
# Row and column counts of the two splits.
train_ds.shape, test_ds.shape

In [None]:
# Every column except the last is an input; the last column is the target.
input_col, target_col = train_ds.columns[:-1], train_ds.columns[-1]
input_col, target_col

In [None]:
# Separate inputs and target for the train and test splits.
train_input = train_ds[input_col]
train_tar = train_ds[target_col]
test_input = test_ds[input_col]
test_tar = test_ds[target_col]

In [None]:
# Shapes of the train and test input frames.
train_input.shape, test_input.shape

In [None]:
# Preview the first rows of the training inputs.
train_input.head()

In [None]:
# Preview the first values of the training target.
train_tar.head()

In [None]:
# Preview the first rows of the test inputs.
test_input.head()

In [None]:
# Preview the first values of the test target.
test_tar.head()

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBClassifier expects class labels encoded as integers 0..n_classes-1, so
# the species labels are mapped to codes before fitting.
# `species_classes[code]` recovers the original label for a predicted code.
species_codes, species_classes = pd.factorize(train_ds.species, sort=True)
X = XGBClassifier()
X.fit(train_input, species_codes)

In [None]:
# json.loads() parses JSON text, not a path, so it fails on a file name.
# Open the file and parse its contents with json.load() instead.
with open("algoparams_from_ui.json", encoding="utf-8") as params_file:
    j_file = json.load(params_file)
j_file