In [None]:
import os
import sys
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
import statistics
from joblib import Parallel, delayed
import warnings
from tqdm import tqdm
import shap




def return_files(directory):
    """
    Return the names of all entries in the specified directory.

    The names are exactly those reported by ``os.listdir``: files and
    sub-directories alike, without the directory prefix and in arbitrary
    order.

    If ``directory`` does not exist (or is not a directory) a message is
    printed and an empty list is returned, so callers can always iterate
    over the result.
    """
    # isdir() also rejects a path that points at a regular file, which
    # os.listdir() would otherwise reject with NotADirectoryError
    if not os.path.isdir(directory):
        print(f"Directory '{directory}' does not exist.")
        return []

    return os.listdir(directory)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.

    The download is best-effort: any error is reported on stdout instead of
    being raised.

    Returns True when the file was retrieved and False when the download
    failed, so callers can avoid reading a stale or missing destination file.
    """
    print(f"Downloading file from {url}...")
    try:
        urllib.request.urlretrieve(url, destination)
    except Exception as e:
        # Deliberately broad: a failed download must not abort the caller
        print(f"An error occurred: {e}")
        return False

    print(f"File downloaded successfully to {destination}.")
    return True

def calculate_significant_features_mi(df, target, threshold=0.7):
    """
    Return the features whose normalized mutual information with the target
    reaches ``threshold``.

    Parameters:
        df: DataFrame holding the feature columns and the target column.
        target: name of the target column in ``df``.
        threshold: minimum mutual information, expressed as a fraction of
            the largest score found among the features.

    Returns:
        List of column names of ``df`` (target excluded), in column order,
        whose mutual information divided by the maximum mutual information
        is at least ``threshold``. The list is empty when no feature carries
        any information about the target.

    Note: ``mutual_info_classif`` is called without a ``random_state``, so
    the scores are not seeded by this function.
    """
    X = df.drop(columns=[target])
    y = df[target]

    # Calculate mutual information using mutual_info_classif since the target is binary
    mi = mutual_info_classif(X, y)

    # Scores are normalized by the best one; when every score is zero the
    # ratio would be 0/0 (NaN plus a RuntimeWarning), so stop here instead
    max_mi = np.max(mi)
    if max_mi <= 0:
        return []

    normalized_mi = mi / max_mi

    # Select features with normalized MI above the threshold
    return [feature for feature, norm_mi in zip(X.columns, normalized_mi)
            if norm_mi >= threshold]
    

def _grid_search_random_forest(X, y, scoring, sample_weight=None):
    """Grid-search a RandomForestClassifier with 5-fold CV and return the best refitted estimator."""
    search = GridSearchCV(
        estimator=RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        param_grid={
            'min_samples_split': [5, 10, 15],
            'min_samples_leaf': [5, 10],
        },
        scoring=scoring, cv=5, n_jobs=-1)
    if sample_weight is None:
        search.fit(X, y)
    else:
        search.fit(X, y, sample_weight=sample_weight)
    return search.best_estimator_


def _make_aux_model(type_aux_mod):
    """Create an unfitted auxiliary regressor; type_aux_mod must already be validated."""
    if type_aux_mod == "linear":
        return LinearRegression(n_jobs=-1)
    return RandomForestRegressor(n_estimators=100, random_state=42,
                                 min_samples_split=5, min_samples_leaf=5, n_jobs=-1)


def _fit_aux_models(data, target_variable, type_aux_mod):
    """For each class value (sorted), fit one regressor per feature that predicts it from the other features."""
    models_per_class = []
    for target_value in sorted(data[target_variable].unique()):
        # Rows of this class only, without the target column
        subset = data[data[target_variable] == target_value].drop(columns=[target_variable])
        models = {}
        for fict_target in subset.columns.tolist():
            model = _make_aux_model(type_aux_mod)
            model.fit(subset.drop(columns=[fict_target]), subset[fict_target])
            models[fict_target] = model
        models_per_class.append(models)
    return models_per_class


def _residual_frame(models_per_class, frame, target_variable, sqrt_error, keep=None):
    """Build the '<feature>_<class index>' columns holding the auxiliary models' prediction errors on frame."""
    columns = {}
    for case, models in enumerate(models_per_class):
        for fict_target, model in models.items():
            name = fict_target + "_" + str(case)
            if keep is not None and name not in keep:
                continue
            predicted = model.predict(frame.drop(columns=[target_variable, fict_target]))
            residual = frame[fict_target].to_numpy(dtype=float) - predicted
            if sqrt_error:
                # Square root of the squared error, i.e. the error magnitude
                residual = np.sqrt(residual ** 2)
            columns[name] = np.round(residual, 4)
    return pd.DataFrame(columns, index=frame.index)


def _class1_shap_frame(explainer, X):
    """Return the SHAP contributions towards class 1 as a DataFrame shaped and labelled like X."""
    raw = explainer.shap_values(X)
    if isinstance(raw, list):
        # Some shap releases return one (n_samples, n_features) array per class
        class1 = np.asarray(raw[1])
    else:
        raw = np.asarray(raw)
        # Otherwise expect (n_samples, n_features, n_classes); class 1 is index 1
        class1 = raw[:, :, 1] if raw.ndim == 3 else None

    if class1 is None or class1.shape != X.shape:
        raise ValueError("Shape mismatch between SHAP values and features_with_balance_scaled")
    return pd.DataFrame(class1, columns=X.columns)


def aux_func(data, target_variable, only_normal=False, type_aux_mod="linear", selection_method=None, 
             frac_feature_imp_normal=0.1, new_cols_corr_thr=0.2,
             shap_treatment=False, sqrt_error=False, seed=123):
    """
    Measure the macro F1-score of a random forest on a binary classification
    dataset, either on the original features (baseline) or on the features
    augmented with auxiliary-model error columns.

    Pipeline:
      1. Drop duplicated rows and rows with missing values, one-hot encode
         the frame (target included), standardize every column with more
         than two distinct values, shuffle with ``seed`` and hold out 15% of
         the rows for testing.
      2. For each class value, fit one auxiliary regressor per feature that
         predicts that feature from the remaining ones, using only the rows
         of that class.
      3. For every row, the prediction error of each auxiliary model becomes
         a new column named '<feature>_<class index>'.
      4. Keep the new columns chosen by ``selection_method``, tune a random
         forest on original + selected columns and score it on the test rows.

    Parameters:
        data: DataFrame containing the features and the target column. The
            caller's frame is not modified.
        target_variable: name of the binary target column.
        only_normal: when True, only the baseline random forest (trained
            with class-balancing sample weights on the original features) is
            fitted and its test score is returned.
        type_aux_mod: "linear" (LinearRegression) or "randomforest"
            (RandomForestRegressor) auxiliary models.
        selection_method: None keeps all new columns; "feature_imp_normal"
            keeps the new columns of the top ``frac_feature_imp_normal``
            fraction of original features ranked by permutation importance
            of the baseline model; "correlation_target" keeps the new columns
            returned by ``calculate_significant_features_mi`` with threshold
            ``new_cols_corr_thr``.
        frac_feature_imp_normal: fraction used by "feature_imp_normal".
        new_cols_corr_thr: threshold used by "correlation_target".
        shap_treatment: when True, the original features are replaced by
            their class-1 SHAP contributions from a class-balanced random
            forest before the final model is trained (see the note in the
            body).
        sqrt_error: when True the new columns hold the error magnitude
            instead of the signed error.
        seed: random state of the row shuffle that precedes the test split.

    Returns:
        The macro-averaged F1-score on the held-out test rows.

    Raises:
        ValueError: if the target does not have exactly two classes after
            cleaning, or if the SHAP output has an unexpected shape.
        SystemExit: (after printing a message) if ``type_aux_mod`` or
            ``selection_method`` is not one of the allowed values and
            ``only_normal`` is False.
    """
    # Reject bad arguments before any expensive training. The baseline-only
    # path never uses these two arguments, so it is exempt from the check.
    if not only_normal:
        if type_aux_mod not in ("linear", "randomforest"):
            print("Error: current allowed type for auxiliary models is 'linear' or 'randomforest'")
            sys.exit()
        if selection_method not in (None, "feature_imp_normal", "correlation_target"):
            print("Error. The parameter selection_method must be 'feature_imp_normal', 'correlation_target' or None")
            sys.exit()

    #################################
    # PREPROCESSING

    # Drop duplicates and rows with missing values (for simplicity); work on
    # a private copy
    data = data.drop_duplicates().dropna().copy()

    # Convert the target into categorical so that one-hot encoding (OHE)
    # also expands it into one dummy column per class
    data[target_variable] = data[target_variable].astype("category")
    target_levels = data[target_variable].cat.categories
    data = pd.get_dummies(data)

    # Locate the dummy columns of the target by their exact names. A plain
    # prefix match could also capture unrelated columns whose name happens
    # to start with '<target>_'.
    target_dummies = [f"{target_variable}_{level}" for level in target_levels]
    if not all(name in data.columns for name in target_dummies):
        # Fall back to the prefix match if the dummies were named differently
        target_dummies = [str(n) for n in data.columns.tolist()
                          if n.startswith(target_variable + "_")]
    if len(target_dummies) != 2:
        raise ValueError(
            f"Target '{target_variable}' must have exactly 2 classes after cleaning, "
            f"found {len(target_dummies)}")

    # Keep one dummy as the target and discard the other. Which one is kept
    # is not important because performance is measured with macro F1.
    target_variable = target_dummies[0]
    data = data.drop(columns=[target_dummies[1]])

    # After OHE, every column with more than 2 distinct values is numerical
    numerical_columns = [c for c in data.columns if data[c].nunique() > 2]
    non_numerical_columns = [c for c in data.columns if c not in numerical_columns]

    # NOTE(review): the scaler is fitted on all rows, before the test split,
    # so test-set statistics leak into training. Kept as in the original
    # experiment; consider fitting on the training rows only.
    if numerical_columns:
        scaled = pd.DataFrame(StandardScaler().fit_transform(data[numerical_columns]),
                              columns=numerical_columns)
        data = pd.concat([scaled, data[non_numerical_columns].reset_index(drop=True)], axis=1)

    # Shuffle the rows, then hold out 15% of them for testing
    data = data.sample(frac=1, random_state=seed)
    data_test = data.sample(frac=0.15, random_state=42)
    data = data.drop(data_test.index)

    average_metric_type = 'macro'

    #################################
    # NORMAL MODEL

    # Only needed to report the baseline performance or to rank the original
    # features for the 'feature_imp_normal' selection
    if only_normal or selection_method == "feature_imp_normal":
        X_normal = data.drop(columns=[target_variable])
        y_normal = data[target_variable]

        # The normal model takes imbalance into account (inverse-frequency
        # sample weights) to generate maximum normal performance
        class_counts = y_normal.value_counts()
        total_samples = class_counts.sum()
        class_weights = {cls: total_samples / count for cls, count in class_counts.items()}

        rf_normal = _grid_search_random_forest(
            X_normal, y_normal,
            scoring=make_scorer(f1_score, average=average_metric_type),
            sample_weight=y_normal.map(class_weights))

        if only_normal:
            predictions_normal = rf_normal.predict(data_test.drop(target_variable, axis=1))
            return f1_score(data_test[target_variable], predictions_normal,
                            average=average_metric_type)

    #################################
    # AUXILIARY MODELS AND THE NEW COLUMNS DERIVED FROM THEM

    models_per_class = _fit_aux_models(data, target_variable, type_aux_mod)

    data = data.reset_index(drop=True)
    dataframe_new = _residual_frame(models_per_class, data, target_variable, sqrt_error)

    # New columns first, original columns after
    result_df = pd.concat([dataframe_new, data], axis=1)

    #################################
    # SELECTION OF NEW COLUMNS CREATED TO CREATE
    # AUGMENTED DF.

    if selection_method == "feature_imp_normal":
        # Rank the original features by permutation importance of the
        # normal model, computed on the training set
        perm_importance = permutation_importance(rf_normal, X_normal, y_normal,
                                                 n_repeats=10, random_state=42)
        ranked = sorted(zip(X_normal.columns, perm_importance.importances_mean),
                        key=lambda x: x[1], reverse=True)
        sorted_feature_names = [feature_name for feature_name, _ in ranked]
        base_selected_columns = sorted_feature_names[0: int(len(sorted_feature_names) * frac_feature_imp_normal)]

        # Keep the error columns of every class for the selected features
        selected_columns = [base + "_" + str(case)
                            for base in base_selected_columns
                            for case in range(len(models_per_class))]
    elif selection_method == "correlation_target":
        # Keep the new columns whose normalized mutual information with the
        # target reaches the threshold
        selected_columns = calculate_significant_features_mi(result_df, target_variable, new_cols_corr_thr)
        selected_columns = [w for w in selected_columns if (w in dataframe_new.columns)]
    else:
        # selection_method is None: keep all created new columns
        selected_columns = dataframe_new.columns.tolist()

    # The augmented dataframe contains the original columns plus the
    # selected new columns
    result_df = result_df.loc[:, data.columns.tolist() + selected_columns]

    #################################
    # TRAINING OF MODEL BASED ON AUGMENTED DF.

    features = result_df.drop(target_variable, axis=1)
    target = result_df[target_variable]

    if shap_treatment:
        # Note: The selected new columns
        # usually render lower performance if class_weight='balanced' is applied
        # in the model. So, theoretically the best option would be to apply
        # class_weight='balanced' to the original df columns, but the application
        # of class_weight='balanced' only to specific columns is currently not
        # supported in scikit-learn. As a workaround, an auxiliary model (aux_model_W)
        # is trained only with the original df columns using class_weight='balanced'.
        # Then, the SHAP contributions from this auxiliary model are extracted and
        # added to selected_columns and from that dataset a model is fit without
        # class_weight='balanced'. The goal is that the final model incorporates the
        # class balance application to the original df columns (at least implicitly
        # via their SHAP contributions) but not to the new columns
        features_with_balance = features.drop(columns=selected_columns)

        aux_model_W = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
        aux_model_W.fit(features_with_balance, target)
        explainer = shap.TreeExplainer(aux_model_W)

        # Selected new columns first, then the SHAP contributions of the
        # original columns (under the original column names)
        features = pd.concat([features[selected_columns],
                              _class1_shap_frame(explainer, features_with_balance)], axis=1)

    # NOTE(review): this search is tuned on the 'weighted' F1 while the
    # normal model is tuned on, and both models are evaluated with, the
    # 'macro' F1. Kept as in the original experiment; confirm it is intended.
    rf_model = _grid_search_random_forest(features, target,
                                          scoring=make_scorer(f1_score, average='weighted'))

    #################################
    # PROCESSING OF TEST DATASET SO THAT THE TRAINED MODEL
    # CAN BE APPLIED TO IT, AND IN THIS WAY OBTAIN
    # PERFORMANCE METRICS

    data_test = data_test.reset_index(drop=True)
    dataframe_new2 = _residual_frame(models_per_class, data_test, target_variable,
                                     sqrt_error, keep=set(selected_columns))
    data_test_processed = pd.concat([dataframe_new2, data_test], axis=1)

    # Same columns, in the same order, as the training features
    features_test = data_test_processed.drop(target_variable, axis=1)
    features_test = features_test.loc[:, features.columns.tolist()]
    target_test = data_test_processed[target_variable]

    if shap_treatment:
        # Replace the original columns by their SHAP contributions, exactly
        # as done for the training features
        features_with_balance_test = features_test.drop(columns=selected_columns)
        features_test = pd.concat([features_test[selected_columns],
                                   _class1_shap_frame(explainer, features_with_balance_test)], axis=1)

    # Now apply trained model on test dataset to gauge performance
    predictions = rf_model.predict(features_test)

    return f1_score(target_test, predictions, average=average_metric_type)






##########################################################################################################################################

# TEST ON DATAFRAMES FROM UCI, DOWNLOAD ADAPTED FROM Perales-González, Carlos, (2020). UCI download-process, v1.3, GitHub repository, https://github.com/cperales/uci-download-process

# Number of statistical repetitions
num_stat_rep = 10

# Get the current working directory
current_directory = os.getcwd()

# Directory whose entries are the classification datasets to process
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# List the entries of the directory. return_files() may return None when the
# directory does not exist, so fall back to an empty list instead of failing
# with a TypeError in the comprehension below.
files_classification = return_files(directory_path_classification) or []
# Each entry is expected to be a dataset folder holding a config.ini file
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]

for i in files_classification:
    try:
        if (i == files_classification[0]): # Big dataset (slow for tests)
            continue
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header_option = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]  
                   data_url = data_url.strip()
               else:
                   if ('separator' in lines[x]):
                       # If no separator is specified, keep the default
                       # value set above ("\\s+", i.e. any run of whitespace)
                       try:
                           if (len(lines[x].split(" = ")) == 1):
                               pass
                           # If the separator is specified    
                           else:
                               separator = lines[x].split(" = ")[1]
                               separator = separator.strip()
                               if ("comma" in separator):
                                   separator = ","
                       except:
                           pass
                               
                   else:
                       if ('target_index' in lines[x]):    
                           try:
                               target_index = lines[x].split(" = ")[1]
                               target_index = int(target_index.strip())
                           except:
                               pass
                       else:
                           if ('header' in lines[x]):    
                               try:
                                   # If there is not header specified, assume
                                   # default value
                                   if (len(lines[x].split(" = ")) == 1):
                                        pass
                                   else:
                                       # If it is specified
                                       header_option = lines[x].split(" = ")[1]
                                       header_option = int(header_option.strip())
                                       # If it is 0 assign it to None
                                       if (str(header_option).startswith("0")):
                                           header_option = None
                               except:
                                   pass
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            try:
                data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python', header=header_option)
                aux_names_columns = [str(aux_n_c) for aux_n_c in data.columns]
                data.columns = aux_names_columns
                    
                # If there will be enough dimensionality after one hot encoding
                data_check = pd.get_dummies(data)
                if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                    # Name of the target of the dataset (target_index - 1 since 
                    # in python first position is 0)
                    target_variable = str(data.columns.tolist()[int(target_index) - 1])
        
                    # If it is a binary classification
                    unique_values_count = data[target_variable].nunique()
                    if (unique_values_count == 2):
                        
                        normal_performance = []
                        for c in range(0, num_stat_rep):
                            normal_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                           only_normal=True, seed=c))

                        normal_performance_stdev = statistics.stdev(normal_performance)
                        normal_performance = statistics.mean(normal_performance)
                        print("Normal performance: " + str(round(normal_performance, 4)) + " +- " + str(round(normal_performance_stdev, 4)))
                        
                        cases_new = ["type_aux_mode=linear; selection_method_case=None; shap_treatment=False; sqrt_error=False"]
                        cases_new.append("type_aux_mode=linear; selection_method_case=None; shap_treatment=True; sqrt_error=False")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=None; shap_treatment=False; sqrt_error=False")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=None; shap_treatment=True; sqrt_error=False")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1; shap_treatment=False; sqrt_error=False")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1; shap_treatment=True; sqrt_error=False")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3; shap_treatment=False; sqrt_error=False")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3; shap_treatment=True; sqrt_error=False")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5; shap_treatment=False; sqrt_error=False")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5; shap_treatment=True; sqrt_error=False")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7; shap_treatment=False; sqrt_error=False")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7; shap_treatment=True; sqrt_error=False")                
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1; shap_treatment=False; sqrt_error=False")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1; shap_treatment=True; sqrt_error=False")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3; shap_treatment=False; sqrt_error=False")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3; shap_treatment=True; sqrt_error=False")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5; shap_treatment=False; sqrt_error=False")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5; shap_treatment=True; sqrt_error=False")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7; shap_treatment=False; sqrt_error=False")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7; shap_treatment=True; sqrt_error=False")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.1; shap_treatment=False; sqrt_error=False")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.1; shap_treatment=True; sqrt_error=False")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.2; shap_treatment=False; sqrt_error=False")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.2; shap_treatment=True; sqrt_error=False")
                        # Register the remaining candidate configurations in one call.
                        # Each label is a "key=value; ..." string; the order of the entries
                        # is preserved because later lists are aligned with cases_new by index.
                        # NOTE(review): labels use both "type_aux_mode=" and
                        # "type_aux_mode_case=" as the model key — confirm whether any
                        # downstream consumer of the saved labels depends on that spelling.
                        cases_new.extend([
                            # randomforest + correlation_target, sqrt_error=False
                            "type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.1; shap_treatment=False; sqrt_error=False",
                            "type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.1; shap_treatment=True; sqrt_error=False",
                            "type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.2; shap_treatment=False; sqrt_error=False",
                            "type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.2; shap_treatment=True; sqrt_error=False",
                            # no feature selection, sqrt_error=True
                            "type_aux_mode=linear; selection_method_case=None; shap_treatment=True; sqrt_error=True",
                            "type_aux_mode=randomforest; selection_method_case=None; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode=randomforest; selection_method_case=None; shap_treatment=True; sqrt_error=True",
                            # linear + feature_imp_normal, sqrt_error=True
                            "type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1; shap_treatment=True; sqrt_error=True",
                            "type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3; shap_treatment=True; sqrt_error=True",
                            "type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5; shap_treatment=True; sqrt_error=True",
                            "type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7; shap_treatment=True; sqrt_error=True",
                            # randomforest + feature_imp_normal, sqrt_error=True
                            "type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1; shap_treatment=True; sqrt_error=True",
                            "type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3; shap_treatment=True; sqrt_error=True",
                            "type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5; shap_treatment=True; sqrt_error=True",
                            "type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7; shap_treatment=True; sqrt_error=True",
                            # linear + correlation_target, sqrt_error=True
                            "type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.1; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.1; shap_treatment=True; sqrt_error=True",
                            "type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.2; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.2; shap_treatment=True; sqrt_error=True",
                            # randomforest + correlation_target, sqrt_error=True
                            "type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.1; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.1; shap_treatment=True; sqrt_error=True",
                            "type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.2; shap_treatment=False; sqrt_error=True",
                            "type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.2; shap_treatment=True; sqrt_error=True",
                        ])

                        # Per-case aggregates, index-aligned with cases_new.
                        cases_new_performances = []
                        cases_new_performances_stdev = []

                        # Evaluate every configuration label: decode its settings, run
                        # aux_func once per seed in range(num_stat_rep), then record the
                        # mean and sample standard deviation of the returned scores.
                        # NOTE(review): the "sqrt_error=..." part of the label is never
                        # parsed or forwarded here, so labels that differ only in
                        # sqrt_error produce identical aux_func calls — confirm whether
                        # aux_func should receive that flag.
                        for case in cases_new:

                            # Settings are recovered by substring search on the label.
                            shap_treatm = "shap_treatment=True" in case
                            model_aux = "linear" if "linear" in case else "randomforest"

                            # Only the feature-selection arguments differ between the
                            # three kinds of case; everything else is shared below.
                            if "None" in case:
                                selection_kwargs = {"selection_method": None}
                            elif "feature_imp_normal" in case:
                                frac_feature_imp_normal_value = float(
                                    case.split("frac_feature_imp_normal=")[1].split(";")[0])
                                selection_kwargs = {
                                    "selection_method": "feature_imp_normal",
                                    "frac_feature_imp_normal": frac_feature_imp_normal_value,
                                }
                            else:
                                new_cols_corr_thr_value = float(
                                    case.split("new_cols_corr_thr=")[1].split(";")[0])
                                selection_kwargs = {
                                    "selection_method": "correlation_target",
                                    "new_cols_corr_thr": new_cols_corr_thr_value,
                                }

                            # One run per seed; each run gets its own copy of the data.
                            repetition_scores = [
                                aux_func(data=data.copy(), target_variable=target_variable,
                                         type_aux_mod=model_aux, shap_treatment=shap_treatm,
                                         seed=c, **selection_kwargs)
                                for c in range(num_stat_rep)
                            ]

                            # statistics.stdev needs at least two values and raises
                            # StatisticsError otherwise; nothing is appended in that case.
                            new_performance_stdev = statistics.stdev(repetition_scores)
                            new_performance = statistics.mean(repetition_scores)
                            cases_new_performances_stdev.append(new_performance_stdev)
                            cases_new_performances.append(new_performance)

                            print(case)
                            print(str(round(new_performance, 4)) + " +- " + str(round(new_performance_stdev, 4)))
                            

                        # Collect the per-case mean/stdev into a table, attach the baseline
                        # figures (normal_performance*, computed outside this excerpt) to
                        # every row, and rank the cases from best to worst mean score.
                        result = pd.DataFrame({
                            "Case": cases_new,
                            "Mean F1-score": cases_new_performances,
                            "Stdev F1-score": cases_new_performances_stdev,
                        })
                        result["Normal Mean F1-score"] = normal_performance
                        result["Normal stdev F1-score"] = normal_performance_stdev
                        result = result.sort_values(by="Mean F1-score", ascending=False)

                        # The output file is named after the last path segment of the URL.
                        data_name = data_url.split("/")[-1]
                        display(result)
                        result.to_csv("metrics_" + data_name + ".csv", index=False)

                        # Report the best new case only when it beats the baseline.
                        best_new_performance = max(cases_new_performances)
                        if best_new_performance > normal_performance:
                            print("New performance: " + str(round(best_new_performance, 3)))
                            # index() returns the first case that reaches the maximum.
                            max_index = cases_new_performances.index(best_new_performance)
                            print("Optimum case new: " + str(cases_new[max_index]))
                        else:
                            print("No improvement found")
                                                        
            
            except:      
                print("Dataset " + i + " could not be processed.")
                    
            
    except FileNotFoundError:  
        print(f"The file '{i}' does not exist.")




Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small+adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt...
File downloaded successfully to dataset_fi