In [244]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, r2_score, mean_squared_error







## JSON Configuration

In [231]:
def load_json_config(file_path):
    """
    Load JSON configuration from the provided file.

    Parameters:
    - file_path (str): Path to the JSON file.

    Returns:
    - dict: Loaded JSON configuration.
    """
    try:
        with open(file_path, 'r') as f:
            config = json.load(f)
        return config
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file '{file_path}': {e}")
        return None

config=load_json_config('/Users/macbook/Documents/algoparams_from_ui1.json')
config


{'session_name': 'test',
 'session_description': 'test',
 'design_state_data': {'session_info': {'project_id': '1',
   'experiment_id': 'kkkk-11',
   'dataset': 'iris_modified.csv',
   'session_name': 'test',
   'session_description': 'test'},
  'target': {'prediction_type': 'Classification',
   'target': 'species',
   'type': 'classifiation',
   'partitioning': True},
  'train': {'policy': 'Split the dataset',
   'time_variable': 'sepal_length',
   'sampling_method': 'No sampling(whole data)',
   'split': 'Randomly',
   'k_fold': False,
   'train_ratio': 0.8,
   'random_seed': 10},
  'feature_handling': {'sepal_length': {'feature_name': 'sepal_length',
    'is_selected': True,
    'feature_variable_type': 'numerical',
    'feature_details': {'numerical_handling': 'Keep as regular numerical feature',
     'rescaling': 'No rescaling',
     'make_derived_feats': False,
     'missing_values': 'Impute',
     'impute_with': 'Average of values'}},
   'sepal_width': {'feature_name': 'sepal_wi

## Identify Data and Problem Type

In [232]:
def identify_data(json_config):
    """
    Identify data file, target variable, and problem type from JSON configuration.
    Create a DataFrame from the provided CSV file.

    Parameters:
    - json_config (dict): JSON configuration containing data information.

    Returns:
    - pd.DataFrame or None: DataFrame created from the CSV file, or None if there's an error.
    - str or None: Target variable name, or None if not found.
    - str or None: Problem type (classification or regression), or None if not found.
    """
    try:
        # Extract data file and target variable from JSON
        data_file = json_config['design_state_data']['session_info']['dataset']
        target_variable = json_config['design_state_data']['target']['target']
        
        # Create DataFrame from CSV file
        data = pd.read_csv(data_file)
        
        # Identify problem type
        if json_config['design_state_data']['target']['prediction_type'] == 'Classification':
            problem_type = 'classification'
        else:
            problem_type = 'regression'

        return data, target_variable, problem_type
    except KeyError as e:
        print(f"Error: Key not found in JSON configuration: {e}")
        return None, None, None
    except FileNotFoundError:
        print(f"Error: Data file '{data_file}' not found.")
        return None, None, None
    except Exception as e:
        print(f"Error: {e}")
        return None, None, None

data, target_variable, problem_type = identify_data(config)


In [233]:
data.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa


In [234]:
target_variable

'species'

In [235]:
problem_type  

'classification'

## Feature Selection:

- Identify the features to be used based on the "is_selected" attribute.
- Create X (features) and y (target) from the DataFrame.

In [236]:
def create_X_Y_dataframe(datafile, config):
    # Step 1: Load the CSV file
    df = pd.read_csv(datafile)
    
    # Step 2: Parse the JSON configuration
    feature_handling = config.get("feature_handling", {})
    target = config.get("target", {})
    
    # Identify selected features and target variable
    selected_features = [feature for feature, details in feature_handling.items() if details.get("is_selected", False)]
    target_variable = target.get("target", None)
    
    # Step 3: Create X and Y DataFrames
    if target_variable is not None and selected_features:
        X_df = df[selected_features]
        Y_df = df[[target_variable]]
        return X_df, Y_df
    else:
        print("Target variable or selected features not found in the configuration.")
        return None, None
datafile = "iris_modified.csv"
config = {
    "feature_handling": {
        "sepal_length": {"is_selected": True},
        "sepal_width": {"is_selected": True},
        "petal_length": {"is_selected": True},
        "petal_width": {"is_selected": True},
        "species": {"is_selected": False }
    },
    "target": {
        "target": "species"
    }
}

X_df, Y_df = create_X_Y_dataframe(datafile, config)
X_df.head()


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [237]:
Y_df.head(2)

Unnamed: 0,species
0,Iris-setosa
1,Iris-setosa


## Data spliting

In [238]:
def split_data(X, Y, train_ratio, random_seed):
    """
    Split the data into training and validation sets based on the given split ratio and random seed value.

    Args:
    - X (DataFrame): The DataFrame containing the features.
    - Y (DataFrame): The DataFrame containing the target variable.
    - train_ratio (float): The ratio of the training set.
    - random_seed (int): Random seed value for reproducibility.

    Returns:
    - X_train (DataFrame): The training features.
    - X_val (DataFrame): The validation features.
    - Y_train (DataFrame): The training target variable.
    - Y_val (DataFrame): The validation target variable.
    """
    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=train_ratio, random_state=random_seed)
    return X_train, X_val, Y_train, Y_val

X_train, X_val, Y_train, Y_val = split_data(X_df, Y_df, train_ratio=0.8, random_seed=10)



In [239]:
X_train.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
58,6.6,2.9,4.6,1.3
97,6.2,2.9,4.3,1.3


In [240]:
Y_train.head(2)

Unnamed: 0,species
58,Iris-versicolor
97,Iris-versicolor


## Handle missing values

In [243]:
def handle_missing_values(data, config):
    """
    Handle missing values in the DataFrame based on the provided configuration.

    Args:
    - data (DataFrame): The DataFrame containing the dataset.
    - config (dict): The configuration containing feature details.

    Returns:
    - DataFrame: The DataFrame with missing values handled according to the configuration.
    """
    features_config = config.get("feature_handling", {})
    for feature_name, feature_config in features_config.items():
        if feature_config.get("missing_values") == "Impute":
            impute_with = feature_config.get("feature_details", {}).get("impute_with", "Average of values")
            if impute_with == "Average of values":
                data[feature_name] = data[feature_name].fillna(data[feature_name].mean())
            # Add additional handling for other impute_with options if needed
    return data


## Encode features 

In [241]:
def encode_features(X_train, X_test, Y_train, Y_test, config):
    """
    Encode features and target variable in the training and testing data based on the provided configuration.

    Args:
    - X_train (DataFrame): The training data.
    - X_test (DataFrame): The testing data.
    - Y_train (DataFrame): The target variable of the training data.
    - Y_test (DataFrame): The target variable of the testing data.
    - config (dict): The configuration containing feature details.

    Returns:
    - DataFrame: The training data with features encoded according to the configuration.
    - DataFrame: The testing data with features encoded according to the configuration.
    - Series: The encoded target variable of the training data.
    - Series: The encoded target variable of the testing data.
    """
    features_config = config.get("feature_handling", {})
    ohe_columns = []
    oe_columns = []

    for feature_name, feature_config in features_config.items():
        feature_variable_type = feature_config.get("feature_variable_type")
        
        if feature_variable_type == "categorical" or feature_variable_type == "text":
            ohe_columns.append(feature_name)
        elif feature_variable_type == "ordinal":
            oe_columns.append(feature_name)

    # One-hot encoding for categorical and text features
    if ohe_columns:
        ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
        ohe.fit(X_train[ohe_columns])
        X_train_ohe = pd.DataFrame(ohe.transform(X_train[ohe_columns]), columns=ohe.get_feature_names(ohe_columns), index=X_train.index)
        X_test_ohe = pd.DataFrame(ohe.transform(X_test[ohe_columns]), columns=ohe.get_feature_names(ohe_columns), index=X_test.index)
        X_train = pd.concat([X_train.drop(ohe_columns, axis=1), X_train_ohe], axis=1)
        X_test = pd.concat([X_test.drop(ohe_columns, axis=1), X_test_ohe], axis=1)

    # Ordinal encoding for ordinal features
    if oe_columns:
        oe = OrdinalEncoder()
        oe.fit(X_train[oe_columns])
        X_train[oe_columns] = oe.transform(X_train[oe_columns])
        X_test[oe_columns] = oe.transform(X_test[oe_columns])

    # Label encoding for remaining categorical features
    le = LabelEncoder()
    for col in X_train.select_dtypes(include=['object']).columns:
        X_train[col] = le.fit_transform(X_train[col])
        X_test[col] = le.transform(X_test[col])

    # Encode target variable for training data
    problem_type = config.get("target", {}).get("prediction_type")
    if problem_type == "Classification":
        le_target = LabelEncoder()
        Y_train_encoded = le_target.fit_transform(Y_train)
    elif problem_type == "Regression":
        oe_target = OrdinalEncoder()
        Y_train_encoded = oe_target.fit_transform(Y_train.values.reshape(-1, 1))
    else:
        raise ValueError("Unsupported problem type in configuration:", problem_type)

    # Encode target variable for testing data
    if problem_type == "Classification":
        Y_test_encoded = le_target.transform(Y_test)
    elif problem_type == "Regression":
        Y_test_encoded = oe_target.transform(Y_test.values.reshape(-1, 1))

    return X_train, X_test, Y_train_encoded, Y_test_encoded


In [186]:
X_train_encoded, X_test_encoded, Y_train_encoded,Y_test_encoded= encode_features(X_train, X_val, Y_train,Y_val ,config)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [195]:
Y_train_encoded.head(2)

Unnamed: 0,encoded_target
0,1
1,1


In [196]:
Y_test_encoded.head(3)

Unnamed: 0,encoded_target
0,1
1,2
2,0


In [199]:
def rescale_features_after_tts(X_train, X_test, config):
    """
    Rescale numerical features in the training and testing data based on the provided configuration.

    Args:
    - X_train (DataFrame): The training data.
    - X_test (DataFrame): The testing data.
    - config (dict): The configuration containing feature rescaling details.

    Returns:
    - DataFrame: The training data with numerical features rescaled according to the configuration.
    - DataFrame: The testing data with numerical features rescaled according to the configuration.
    """
    for feature_name, feature_config in config.items():
        if feature_config.get('feature_variable_type') == 'numerical':
            rescaling_method = feature_config['feature_details'].get('rescaling')
            if rescaling_method == 'Standardization':
                scaler = StandardScaler()
            elif rescaling_method == 'Min-Max Scaling':
                scaler = MinMaxScaler()
            elif rescaling_method == 'No rescaling':
                continue
            else:
                raise ValueError(f"Unsupported rescaling method for feature '{feature_name}': {rescaling_method}")
            
            # Fit scaler on training data and transform both training and testing data
            scaler.fit(X_train[[feature_name]])
            X_train[feature_name] = scaler.transform(X_train[[feature_name]])
            X_test[feature_name] = scaler.transform(X_test[[feature_name]])
    
    return X_train, X_test

X_train_rescaled, X_test_rescaled = rescale_features_after_tts(X_train, X_val, config['feature_handling'])


## Model Building:

- Identify the selected algorithm from the JSON.
- Extract hyperparameters for the selected algorithm.
- Use GridSearchCV for hyperparameter tuning.
- Train the model on the training data.

In [224]:
def parse_json_and_build_models(json_data):
    selected_models = []
    for model_name, model_config in json_data['design_state_data']['algorithms'].items():
        if model_config.get('is_selected', False):
            # Extract hyperparameters from model configuration
            hyperparameters = {}
            for key, value in model_config.items():
                if key in ['n_estimators', 'max_depth',
                           'min_samples_per_leaf_min_value', 'min_samples_per_leaf_max_value',
                           'regularization_term', 'max_iter', 'min_regparam', 'max_regparam']:
                    hyperparameters[key] = value
            
            # Define parameter grid for GridSearchCV based on extracted hyperparameters
            param_grid = {}
            for key, value in hyperparameters.items():
                if isinstance(value, list):
                    param_grid[key] = value
                elif isinstance(value, int):
                    param_grid[key] = [value]
                elif isinstance(value, dict):
                    param_grid[key] = range(value['min'], value['max'] + 1)
            
            # Create model instance
            if model_name == 'RandomForestClassifier':
                model = RandomForestClassifier(random_state=42)
            elif model_name == 'DecisionTreeClassifier':
                model = DecisionTreeClassifier(random_state=42)
            elif model_name == 'LogisticRegression':
                model = LogisticRegression(random_state=42)
            
            # Append model name, model instance, and parameter grid to selected_models list
            selected_models.append((model_name, model, param_grid))
                
            
    return selected_models

# Load JSON data (replace this with your JSON data)
json_data = load_json_config('/Users/macbook/Documents/algoparams_from_ui1.json')

# Parse JSON and build selected models
selected_models = parse_json_and_build_models(json_data)

# Perform hyperparameter tuning using GridSearchCV
best_models = []
for model_name, model, param_grid in selected_models:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    if model_name == "LogisticRegression":
        grid_search.fit(X_train_rescaled, Y_train_encoded)
    else:
        grid_search.fit(X_train_rescaled, np.ravel(Y_train_encoded))
    best_models.append((model_name, grid_search.best_estimator_, grid_search.best_params_))




In [245]:
def evaluate_classification_model(model, X_test, y_test):
    # Predict the target values
    y_pred = model.predict(X_test)
    
    # Calculate confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    # Calculate classification report
    report = classification_report(y_test, y_pred)
    
    return cm, report

def evaluate_regression_model(model, X_test, y_test):
    # Predict the target values
    y_pred = model.predict(X_test)
    
    # Calculate R-squared
    r2 = r2_score(y_test, y_pred)
    
    # Calculate Adjusted R-squared
    n = X_test.shape[0]
    p = X_test.shape[1]
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    return r2, adj_r2, rmse

# Evaluation for classification models
for model_name, model, _ in best_models:
    if model_name in ['RandomForestClassifier', 'DecisionTreeClassifier', 'LogisticRegression']:
        print(f"Evaluation for {model_name}:")
        cm, report = evaluate_classification_model(model, X_test_rescaled, Y_test_encoded)
        print("Confusion Matrix:")
        print(cm)
        print("Classification Report:")
        print(report)
        print()

# Evaluation for regression models
for model_name, model, _ in best_models:
    if model_name in ['RandomForestRegressor', 'DecisionTreeRegressor', 'LinearRegression']:
        print(f"Evaluation for {model_name}:")
        r2, adj_r2, rmse = evaluate_regression_model(model, X_test_rescaled, Y_test_encoded)
        print(f"R-squared: {r2}")
        print(f"Adjusted R-squared: {adj_r2}")
        print(f"RMSE: {rmse}")
        print()




## Model Evaluation:

- Evaluate the trained model using appropriate evaluation metrics based on the problem type (classification or regression).
- Print the evaluation results.

In [219]:
def print_classification_metrics(model, X_test, y_test):
    # Predict the target values
    y_pred = model.predict(X_test)
    
    # Print classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Print classification metrics for each model
print("Random Forest Classifier:")
print_classification_metrics(rf_grid_search.best_estimator_, X_test_rescaled, Y_test_encoded)

print("\nDecision Tree Classifier:")
print_classification_metrics(dt_grid_search.best_estimator_, X_test_rescaled, Y_test_encoded)

print("\nLogistic Regression:")
print_classification_metrics(lr_grid_search.best_estimator_, X_test_rescaled, Y_test_encoded)

Random Forest Classifier:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00         7

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


Decision Tree Classifier:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.92      0.85      0.88        13
           2       0.75      0.86      0.80         7

    accuracy                           0.90        30
   macro avg       0.89      0.90      0.89        30
weighted avg       0.91      0.90      0.90        30


Logistic Regression:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      