In [16]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from striprtf.striprtf import rtf_to_text
import pandas as pd
import json


# Function to load JSON data from an RTF file
def load_json_from_rtf(file_path):
    # Read the RTF file and extract plain text
    with open(file_path, 'r') as file:
        rtf_content = file.read()

    plain_text = rtf_to_text(rtf_content)

    # Load the extracted text as JSON data
    json_data = json.loads(plain_text)
    return json_data


# Function to load a CSV file as a Pandas DataFrame
def load_csv_as_dataframe(file_path):
    data = pd.read_csv(file_path)
    return data


# Function to create a feature handling pipeline
def create_feature_handling_pipeline(feature_handling):
    steps = []
    
    # Iterate through each feature to handle missing values
    for feature, details in feature_handling.items():
        feature_details = details.get('feature_details')
        if feature_details:
            impute_with = feature_details.get('impute_with')
            impute_value = feature_details.get('impute_value')
            if impute_with and impute_value:
                if impute_with == 'Impute':
                    if impute_value == 'Average of values':
                        imputer = SimpleImputer(strategy='mean')
                        steps.append((f'imputer_{feature}', imputer))
                    elif impute_value == 'custom':
                        imputer = SimpleImputer(strategy='constant', fill_value=feature_details['custom_value'])
                        steps.append((f'imputer_{feature}', imputer))
                    # Add more conditions for other imputation methods
    
    if not steps:
        # If no steps were appended, add a passthrough step
        steps.append(('passthrough', 'passthrough'))
    
    feature_handling_pipeline = Pipeline(steps)
    return feature_handling_pipeline


# Function to create the feature reduction pipeline
def create_feature_reduction_pipeline(json_data, df):
    # Label encoding for object type columns
    for feature in df.columns:
        if df[feature].dtype == 'object':
            label_encoder = LabelEncoder()
            df[feature] = label_encoder.fit_transform(df[feature])

    feature_reduction = json_data['design_state_data']['feature_reduction']
    method = feature_reduction.get('feature_reduction_method')
    target = json_data['design_state_data']['target']['target']
    X = df.drop(columns=[target]) if target in df.columns else df.copy()

    steps = []

    if method == 'No Reduction':
        return None, X, df[target]  # No reduction to be performed

    if method == 'Corr with Target':
        if target in df.columns:
            corr_values = df.corr()[target].sort_values(ascending=False)
            num_features_to_select = 5
            selected_features = corr_values.index[1:num_features_to_select + 1]
            if target in df.columns:
                steps.append(('feature_selection', SelectKBest(score_func=f_regression, k=num_features_to_select)))

    if method == 'Tree-based':
        if target in df.columns:
            model = RandomForestRegressor()
            model.fit(X, df[target])
            feature_importance = model.feature_importances_
            num_features_to_select = 5
            steps.append(('feature_selection', SelectKBest(k=num_features_to_select)))

    if method == 'PCA':
        if len(X.columns) > 0:  # Check if there are columns to perform PCA
            n_components = min(3, X.shape[1])  # Adjust the number of components if needed
            pca = PCA(n_components=n_components)
            steps.append(('pca', pca))

    feature_reduction_pipeline = Pipeline(steps)
    return feature_reduction_pipeline, X, df[target]



# Function to create the model fitting pipeline
def create_model_fitting_pipeline(json_data, df, X, y):
    # Extract the split parameters
    train_params = json_data['design_state_data']['train']
    train_ratio = train_params['train_ratio']
    random_seed = train_params['random_seed']
    
    # Extracting the target prediction type and the algorithms section
    prediction_type = json_data['design_state_data']['target']['prediction_type']
    algorithms = json_data['design_state_data']['algorithms']

    # Define model parameters based on the algorithm configuration in the JSON
    model_parameters = {
        "RandomForestClassifier": algorithms.get("RandomForestClassifier", {}),
        "RandomForestRegressor": algorithms.get("RandomForestRegressor", {}),
        "GBTClassifier": algorithms.get("GBTClassifier", {}),
        "GBTRegressor": algorithms.get("GBTRegressor", {}),
        "LinearRegression": algorithms.get("LinearRegression", {}),
        "LogisticRegression": algorithms.get("LogisticRegression", {}),
        "RidgeRegression": algorithms.get("RidgeRegression", {}),
        "LassoRegression": algorithms.get("LassoRegression", {}),
        "ElasticNetRegression": algorithms.get("ElasticNetRegression", {}),
        "xg_boost": algorithms.get("xg_boost", {}),
        "DecisionTreeRegressor": algorithms.get("DecisionTreeRegressor", {}),
        "DecisionTreeClassifier": algorithms.get("DecisionTreeClassifier", {}),
        "SVM": algorithms.get("SVM", {}),
        "SGD": algorithms.get("SGD", {}),
        "KNN": algorithms.get("KNN", {}),
        "extra_random_trees": algorithms.get("extra_random_trees", {}),
        "neural_network": algorithms.get("neural_network", {})
    }

    # Predefined model objects for regression and classification
    model_objects = {
        "Regression": {
            "RandomForestRegressor": RandomForestRegressor(),
            "GBTRegressor": XGBRegressor(),
            "LinearRegression": LinearRegression(),
            "LassoRegression": Lasso(),
            "RidgeRegression": Ridge(),
            "ElasticNetRegression": ElasticNet(),
            "DecisionTreeRegressor": DecisionTreeRegressor(),
            "SVR": SVR(),
            "SGD": SGDRegressor(),
            "KNN": KNeighborsRegressor(),
            "extra_random_trees": RandomForestRegressor(),  # Change this with specific model object if available
            "neural_network": MLPRegressor()
        },
        "Classification": {
            "RandomForestClassifier": RandomForestClassifier(),
            "GBTClassifier": XGBClassifier(),
            "LogisticRegression": LogisticRegression(),
            "DecisionTreeClassifier": DecisionTreeClassifier(),
            "SVM": SVC(),
            "SGD": SGDClassifier(),
            "KNN": KNeighborsClassifier(),
            "neural_network": MLPClassifier()
        }
    }
    # Selecting model objects based on the prediction type
    selected_models = model_objects[prediction_type]
    selected_models_to_tune = []
    
    for model_name, model_instance in selected_models.items():
        # Ensure 'is_selected' is set to True for the model in the JSON
        if model_name in model_parameters and model_parameters[model_name].get("is_selected", False):
            model_params = model_parameters[model_name]

            # Create the model instance based on the model_name
            model_instance = selected_models[model_name]  # Ensure this line is used

            # Split the data for training and testing
            if train_ratio == 0:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
            else:
                X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio, random_state=random_seed)

            # Add selected models to tune with GridSearchCV
            selected_models_to_tune.append((model_name, model_instance, X_train, y_train, X_test, y_test))
    
     # Define default parameter grids for hyperparameter tuning
    default_param_grids = {
        'RandomForestRegressor': {
            'n_estimators': [100, 300, 500],
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10]
            # Add other RandomForestRegressor hyperparameters here
        },
        'GBTRegressor': {
            'n_estimators': [100, 300, 500],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.1, 0.01]
        # Add other XGBRegressor hyperparameters here
    },
    'LinearRegression': {
        'fit_intercept': [True, False],
        'normalize': [True, False]
        # Add other LinearRegression hyperparameters here
    },
    'LassoRegression': {
        'alpha': [0.1, 1.0, 10.0]
        # Add other LassoRegression hyperparameters here
    },
    'RidgeRegression': {
        'alpha': [0.1, 1.0, 10.0]
        # Add other RidgeRegression hyperparameters here
    },
    'ElasticNetRegression': {
        'alpha': [0.1, 1.0, 10.0],
        'l1_ratio': [0.1, 0.5, 0.9]
        # Add other ElasticNetRegression hyperparameters here
    },
    'DecisionTreeRegressor': {
        'max_depth': [None, 5, 10, 20],
        'min_samples_split': [2, 5, 10]
        # Add other DecisionTreeRegressor hyperparameters here
    },
    'SVR': {
        'C': [1, 10, 100],
        'kernel': ['linear', 'rbf']
        # Add other SVR hyperparameters here
    },
    'SGD': {
        'loss': ['squared_loss', 'huber'],
        'alpha': [0.0001, 0.001, 0.01]
        # Add other SGDRegressor hyperparameters here
    },
    'KNN': {
        'n_neighbors': [3, 5, 10],
        'weights': ['uniform', 'distance']
        # Add other KNeighborsRegressor hyperparameters here
    },
    'extra_random_trees': {
        'n_estimators': [100, 300, 500],
        'max_depth': [None, 5, 10, 20],
        'min_samples_split': [2, 5, 10]
        # Add other RandomForestRegressor (Extra Trees) hyperparameters here
    },
    'neural_network': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh']
        # Add other MLPRegressor (Neural Network) hyperparameters here
        }
    }
    tuned_models = []
    
    for model_name, model_instance, X_train, y_train, X_test, y_test in selected_models_to_tune:
        model_name = type(model_instance).__name__
        if model_name in default_param_grids:
            param_grid = default_param_grids[model_name]

            # Use GridSearchCV to tune the models
            grid_search = GridSearchCV(model_instance, param_grid=param_grid, cv=5)
            grid_search.fit(X_train, y_train)

            # Collect the tuned model information
            tuned_models.append({
                "model_name": model_name,
                "best_estimator": grid_search.best_estimator_,
                "best_params": grid_search.best_params_,
                "best_score": grid_search.best_score_,
                "X_test": X_test,
                "y_test": y_test
            })

    return tuned_models
    
# Function to calculate model metrics
def calculate_model_metrics(tuned_models, prediction_type, X_test, y_test):
    for tuned_model in tuned_models:
        model_name = tuned_model["model_name"]
        best_estimator = tuned_model["best_estimator"]

        if prediction_type == "Regression":
            test_predictions = best_estimator.predict(X_test)
            mse = mean_squared_error(y_test, test_predictions)
            r2 = r2_score(y_test, test_predictions)

            print(f"Metrics for {model_name}:")
            print(f"MSE: {mse}")
            print(f"R-squared: {r2}")
        elif prediction_type == "Classification":
            test_predictions = best_estimator.predict(X_test)
            accuracy = accuracy_score(y_test, test_predictions)
            f1 = f1_score(y_test, test_predictions)

            print(f"Metrics for {model_name}:")
            print(f"Accuracy: {accuracy}")
            print(f"F1 Score: {f1}")






In [17]:
from sklearn.pipeline import Pipeline

def create_pipeline(json_file, csv_file):
    # Load data from the provided files
    json_data = load_json_from_rtf(json_file)  # Load and extract JSON data from the RTF file
    df = load_csv_as_dataframe(csv_file)  # Load the CSV file as a DataFrame

    pipeline_results = []  # Initialize an empty list to store model results

    # Data Preprocessing Pipeline
    feature_handling_pipeline = create_feature_handling_pipeline(json_data['design_state_data']['feature_handling'])  # Creating a data preprocessing pipeline
    X_features_handled = feature_handling_pipeline.fit_transform(df)  # Applying the preprocessing to the DataFrame
    target = json_data['design_state_data']['target']['target']  # Extracting the target variable for model fitting

    # Feature Reduction Pipeline
    feature_reduction_pipeline, X, y = create_feature_reduction_pipeline(json_data, df)  # Creating a feature reduction pipeline
    X_features_reduced = feature_reduction_pipeline.fit_transform(X_features_handled, y)  # Applying feature reduction to preprocessed data

    # Model Fitting Pipeline
    tuned_models = create_model_fitting_pipeline(json_data, df, X, y)  # Creating and fitting models to the data

    for tuned_model in tuned_models:
        model = tuned_model["best_estimator"]  # Extracting the best fitted model
        try:
            pipeline_results.append(model)  # Storing the best models in a list
        except Exception as e:
            print(f"An error occurred while fitting the model: {e}")  # Handling exceptions during model fitting

    # Extracting the target prediction type
    prediction_type = json_data['design_state_data']['target']['prediction_type']  # Extracting the type of prediction (Regression or Classification)

    # Assuming you have these variables available: tuned_models, prediction_type, X_test, y_test
    X_test = tuned_model["X_test"]  # Obtaining the test features for model evaluation
    y_test = tuned_model["y_test"]  # Obtaining the test target values for model evaluation

    calculate_model_metrics(tuned_models, prediction_type, X_test, y_test)  # Calculate and print model evaluation metrics
    return tuned_models  # Returning the list of tuned models

# Example usage:
pipeline_results = create_pipeline('algoparams_from_ui.json.rtf', 'iris_modified.csv')


  f = msb / msw


Metrics for RandomForestRegressor:
MSE: 0.03612550333192687
R-squared: 0.9247542109312084
