In [1]:
# Usando modelos auxiliares lineales

import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")



# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        # aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a
                            
                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data_test.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data_test[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
F1 Score: [0.66268062 0.91267896]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data
...
File downloaded successfully to dat

In [1]:
# Usando modelos auxiliares de RF

import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")



# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        # aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a
                            
                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data_test.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data_test[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
F1 Score: [0.64616886 0.90597847]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data
...
File downloaded successfully to dat

In [None]:
# Se puede observar que dependiendo de si se usan modelos auxiliares lineales o de RF
# se obtienen unos resultados u otros --> Para cada dataset habrá que ver cuál es la mejor
# opción

In [None]:
# Pruebas añadiendo como nuevas columnas solo aquellas que estén
# por encima de un cierto umbral en su correlación con la target.
# Por lo menos estableciendo un threshold del 20%, en la mayoría
# de los casos empeora, pero en algunos se obtiene una mejora
# significativa --> Para cada dataset habrá que ver cuál es la mejor
# opción, o quizá buscar otra forma de probar con distintas nuevas
# columnas

# Para todas estas pruebas además habrá que realizar varias repeticiones
# estadísticas para obtener datos más fiables

In [15]:
# Usando modelos auxiliares lineales y threshold de 20%
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Threshold of correlation with target for new columns
new_cols_threshold = 0.2

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        # aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            selected_columns = selected_columns.tolist()
                            print(selected_columns)
                            print(len(selected_columns))
                            print(result_df.shape)
                            
                            result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
[]
0
(27656, 325)
F1 Score: [0.67607004 0.9135514 ]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data
...
File downloaded s

In [16]:
# Usando modelos auxiliares de RF y threshold de 20%
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Threshold of correlation with target for new columns
new_cols_threshold = 0.2

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        # aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            selected_columns = selected_columns.tolist()
                            print(selected_columns)
                            print(len(selected_columns))
                            print(result_df.shape)
                            
                            result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
['39_0', ' 13_1', ' Bachelors_ 12th_1', ' Bachelors_ Assoc-voc_0', ' Bachelors_ Preschool_0', ' Not-in-family_ Other-relative_1']
6
(27656, 325)
F1 Score: [0.6651439 0.9031832]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Do

In [None]:
# Pruebas realizando un feature importance de un modelo generado solo con las
# variables nuevas, y extrayendo el 10% de las variables nuevas con mayor 
# feature importance para añadir al dataset. Esta estrategia por lo general
# parece poco efectiva, a diferencia de la selección por correlación con la
# target, donde a veces (en el caso de los modelos lineales) sí se apreciaba
# una ventaja.

In [20]:
# Usando modelos auxiliares lineales
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Threshold of correlation with target for new columns
new_cols_threshold = 0.2

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        # aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new

                            #####################
                            ####################
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            cols_b = dataframe_new.columns.to_list()
                            
                            
                            # Concatenate horizontally
                            aux_df = pd.concat([dataframe_new, data[[target_variable]]], axis=1, ignore_index=True)
                            aux_df.columns = cols_b + [target_variable]
                            
                            features = aux_df.drop(target_variable, axis=1)
                            target = aux_df[target_variable]
                            rf_model_aux = RandomForestClassifier(n_estimators=200, random_state=42, min_samples_split=10, min_samples_leaf=5)
                            rf_model_aux.fit(features, target)
                            
                            feature_importances = rf_model_aux.feature_importances_
                            
                            # Create a list of (feature_name, importance) tuples
                            feature_importance_tuples = zip(features.columns, feature_importances)
                                                        
                            # Sort the tuples based on importance
                            sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
                            
                            # Extract sorted feature names
                            sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]

                            # Select some features
                            selected_columns = sorted_feature_names[0: int(len(sorted_feature_names)/10)]
                            
                            ####################
                            ####################

                            dataframe_new = dataframe_new.loc[:, selected_columns]
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            # correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            # selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            # selected_columns = selected_columns.tolist()
                            # print(selected_columns)
                            # print(len(selected_columns))
                            # print(result_df.shape)
                            
                            # result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
F1 Score: [0.52893519 0.89865538]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data
...
File downloaded successfully to dat

In [18]:
# Usando modelos auxiliares de RF
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Threshold of correlation with target for new columns
new_cols_threshold = 0.2

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        # aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new

                            #####################
                            ####################
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            cols_b = dataframe_new.columns.to_list()
                            
                            
                            # Concatenate horizontally
                            aux_df = pd.concat([dataframe_new, data[[target_variable]]], axis=1, ignore_index=True)
                            aux_df.columns = cols_b + [target_variable]
                            
                            features = aux_df.drop(target_variable, axis=1)
                            target = aux_df[target_variable]
                            rf_model_aux = RandomForestClassifier(n_estimators=200, random_state=42, min_samples_split=10, min_samples_leaf=5)
                            rf_model_aux.fit(features, target)
                            
                            feature_importances = rf_model_aux.feature_importances_
                            
                            # Create a list of (feature_name, importance) tuples
                            feature_importance_tuples = zip(features.columns, feature_importances)
                                                        
                            # Sort the tuples based on importance
                            sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
                            
                            # Extract sorted feature names
                            sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]

                            # Select some features
                            selected_columns = sorted_feature_names[0: int(len(sorted_feature_names)/10)]
                            ####################
                            ####################

                            dataframe_new = dataframe_new.loc[:, selected_columns]
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            # correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            # selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            # selected_columns = selected_columns.tolist()
                            # print(selected_columns)
                            # print(len(selected_columns))
                            # print(result_df.shape)
                            
                            # result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
F1 Score: [0.58933195 0.89870992]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data
...
File downloaded successfully to dat

In [None]:
# Selección de nuevas columnas en base a la importancia de las variables
# del modelo normal. En concreto, se toman como base el 10% de las variables
# con más importancia en el modelo normal y luego se seleccionan para cada
# una de ellas la variable nueva terminada en "_0" y la terminada en "_1".

# Al menos usando modelos auxiliares lineales, se observa algunos casos donde
# mejora, como en el breast cancer dataset

In [2]:
# Usando modelos auxiliares lineales
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header_option = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]  
                   data_url = data_url.strip()
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       try:
                           if (len(lines[x].split(" = ")) == 1):
                               pass
                           # If the separator is specified    
                           else:
                               separator = lines[x].split(" = ")[1]
                               separator = separator.strip()
                               if ("comma" in separator):
                                   separator = ","
                       except:
                           pass
                               
                   else:
                       if ('target_index' in lines[x]):    
                           try:
                               target_index = lines[x].split(" = ")[1]
                               target_index = int(target_index.strip())
                           except:
                               pass
                       else:
                           if ('header' in lines[x]):    
                               try:
                                   # If there is not header specified, assume
                                   # default value
                                   if (len(lines[x].split(" = ")) == 1):
                                        pass
                                   else:
                                       # If it is specified
                                       header_option = lines[x].split(" = ")[1]
                                       header_option = int(header_option.strip())
                                       # If it is 0 assign it to None
                                       if (str(header_option).startswith("0")):
                                           header_option = None
                               except:
                                   pass
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python', header=header_option)
                    aux_names_columns = [str(aux_n_c) for aux_n_c in data.columns]
                    data.columns = aux_names_columns
                    
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = str(data.columns.tolist()[int(target_index) - 1])
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [str(n) for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')

                            #######################################
                            # Calculations for later selection of new columns
                            feature_importances = rf_normal.feature_importances_
                            # Create a list of (feature_name, importance) tuples
                            feature_importance_tuples = zip(X_train.columns, feature_importances)
                            # Sort the tuples based on importance
                            sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
                            # Extract sorted feature names
                            sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]
                            # Select some features
                            base_selected_columns = sorted_feature_names[0: int(len(sorted_feature_names)/10)]
                            #######################################
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        # aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new

                            #####################
                            ####################
                            
                            # Select some features
                            selected_columns = [] 
                            for aux_base_cols in base_selected_columns:
                                selected_columns.append(aux_base_cols + "_0")
                                selected_columns.append(aux_base_cols + "_1")
                            
                            ####################
                            ####################

                            dataframe_new = dataframe_new.loc[:, selected_columns]
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            # correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            # selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            # selected_columns = selected_columns.tolist()
                            # print(selected_columns)
                            # print(len(selected_columns))
                            # print(result_df.shape)
                            
                            # result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to process the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data...
File downloaded successfully to dataset_file_aux.txt.
F1 Score normal: [0.69234427 0.91552422]
0
#########################################
1
#########################################
F1 Score: [0.64977192 0.91128515]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from h

In [3]:
# Usando modelos auxiliares RF
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header_option = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]  
                   data_url = data_url.strip()
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       try:
                           if (len(lines[x].split(" = ")) == 1):
                               pass
                           # If the separator is specified    
                           else:
                               separator = lines[x].split(" = ")[1]
                               separator = separator.strip()
                               if ("comma" in separator):
                                   separator = ","
                       except:
                           pass
                               
                   else:
                       if ('target_index' in lines[x]):    
                           try:
                               target_index = lines[x].split(" = ")[1]
                               target_index = int(target_index.strip())
                           except:
                               pass
                       else:
                           if ('header' in lines[x]):    
                               try:
                                   # If there is not header specified, assume
                                   # default value
                                   if (len(lines[x].split(" = ")) == 1):
                                        pass
                                   else:
                                       # If it is specified
                                       header_option = lines[x].split(" = ")[1]
                                       header_option = int(header_option.strip())
                                       # If it is 0 assign it to None
                                       if (str(header_option).startswith("0")):
                                           header_option = None
                               except:
                                   pass
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python', header=header_option)
                    aux_names_columns = [str(aux_n_c) for aux_n_c in data.columns]
                    data.columns = aux_names_columns
                    
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = str(data.columns.tolist()[int(target_index) - 1])
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [str(n) for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')

                            #######################################
                            # Calculations for later selection of new columns
                            feature_importances = rf_normal.feature_importances_
                            # Create a list of (feature_name, importance) tuples
                            feature_importance_tuples = zip(X_train.columns, feature_importances)
                            # Sort the tuples based on importance
                            sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
                            # Extract sorted feature names
                            sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]
                            # Select some features
                            base_selected_columns = sorted_feature_names[0: int(len(sorted_feature_names)/10)]
                            #######################################
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        # aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new

                            #####################
                            ####################
                            
                            # Select some features
                            selected_columns = [] 
                            for aux_base_cols in base_selected_columns:
                                selected_columns.append(aux_base_cols + "_0")
                                selected_columns.append(aux_base_cols + "_1")
                            
                            ####################
                            ####################

                            dataframe_new = dataframe_new.loc[:, selected_columns]
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            # correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            # selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            # selected_columns = selected_columns.tolist()
                            # print(selected_columns)
                            # print(len(selected_columns))
                            # print(result_df.shape)
                            
                            # result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to process the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data...
File downloaded successfully to dataset_file_aux.txt.
F1 Score normal: [0.69234427 0.91552422]
0
#########################################
1
#########################################
F1 Score: [0.67910751 0.89159929]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from h

In [None]:
# Original but choosing for each auxiliary model
# the best performance (linear vs RF). It seems
# that this giving in general poor results

In [6]:
# Usando modelos auxiliares RF
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header_option = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]  
                   data_url = data_url.strip()
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       try:
                           if (len(lines[x].split(" = ")) == 1):
                               pass
                           # If the separator is specified    
                           else:
                               separator = lines[x].split(" = ")[1]
                               separator = separator.strip()
                               if ("comma" in separator):
                                   separator = ","
                       except:
                           pass
                               
                   else:
                       if ('target_index' in lines[x]):    
                           try:
                               target_index = lines[x].split(" = ")[1]
                               target_index = int(target_index.strip())
                           except:
                               pass
                       else:
                           if ('header' in lines[x]):    
                               try:
                                   # If there is not header specified, assume
                                   # default value
                                   if (len(lines[x].split(" = ")) == 1):
                                        pass
                                   else:
                                       # If it is specified
                                       header_option = lines[x].split(" = ")[1]
                                       header_option = int(header_option.strip())
                                       # If it is 0 assign it to None
                                       if (str(header_option).startswith("0")):
                                           header_option = None
                               except:
                                   pass
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python', header=header_option)
                    aux_names_columns = [str(aux_n_c) for aux_n_c in data.columns]
                    data.columns = aux_names_columns
                    
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = str(data.columns.tolist()[int(target_index) - 1])
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [str(n) for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')

                            #######################################
                            # Calculations for later selection of new columns
                            feature_importances = rf_normal.feature_importances_
                            # Create a list of (feature_name, importance) tuples
                            feature_importance_tuples = zip(X_train.columns, feature_importances)
                            # Sort the tuples based on importance
                            sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
                            # Extract sorted feature names
                            sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]
                            # Select some features
                            base_selected_columns = sorted_feature_names[0: int(len(sorted_feature_names)/10)]
                            #######################################
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:

                                        # First try RF
                                        aux_model_RF = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        aux_model_RF.fit(X_train, y_train)
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model_RF.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared_RF = 1
                                        else:    
                                            r_squared_RF = 1 - (rss / tss)    

                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    

  
                                        # Now try linear regression
                                        aux_model_linear = LinearRegression()
                                        aux_model_linear.fit(X_train, y_train)
                                        
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model_linear.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared_linear  = 1
                                        else:    
                                            r_squared_linear = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    

                                        if (r_squared_RF > r_squared_linear):
                                            dictionary_aux[fict_target] = aux_model_RF
                                            dictionary_aux_r_squared[fict_target] = r_squared_RF
                                        else:
                                            dictionary_aux[fict_target] = aux_model_linear
                                            dictionary_aux_r_squared[fict_target] = r_squared_linear
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new

                            #####################
                            ####################
                            
                            # Select some features
                            selected_columns = [] 
                            for aux_base_cols in base_selected_columns:
                                selected_columns.append(aux_base_cols + "_0")
                                selected_columns.append(aux_base_cols + "_1")
                            
                            ####################
                            ####################

                            # dataframe_new = dataframe_new.loc[:, selected_columns]
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            # correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            # selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            # selected_columns = selected_columns.tolist()
                            # print(selected_columns)
                            # print(len(selected_columns))
                            # print(result_df.shape)
                            
                            # result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    # if ((fict_target + '_' + str(case)) in selected_columns):
                                    if True:
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    # if ((u + '_' + str(case)) in selected_columns):    
                                    names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to process the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data...
File downloaded successfully to dataset_file_aux.txt.
F1 Score normal: [0.69234427 0.91552422]
0
#########################################
1
#########################################
F1 Score: [0.60213904 0.90572732]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from h

In [None]:
# Testing multiple hyperparameters (type of auxiliary model, selection of new variables).


In [66]:
import os
import sys
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import statistics


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


def aux_func(data, target_variable, only_normal=False, type_aux_mod="linear", selection_method=None, 
             frac_feature_imp_normal=0.1, new_cols_corr_thr=0.2, seed=123):
    
    #################################
    # PREPROCESSING
    data = data.drop_duplicates()
                            
    # Handling missing values (drop rows with missing values for simplicity)
    data.dropna(inplace=True)
    
    # Encoding categorical variables using one-hot encoding (OHE).
    # First convert the target into categorical. This facilitates later
    # calculations
    data[target_variable] = data[target_variable].astype("category")
    # Now apply OHE to categorical columns (including the target)
    data = pd.get_dummies(data)

    # After OHE there will be two columns created for the target (since it was binary).
    # Choose one of those columns as target and discard the other one. It is not important
    # which of the two is selected as target, since the F1-score performance is measured
    # using 'macro' average option
    aux_names_target = [str(n) for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
    target_variable = aux_names_target[0]
    data = data.drop(columns=[aux_names_target[1]])
                            
    # Normalizing numerical variables. Since previously OHE
    # was performed, all variables with more than 2 different
    # values will be numerical
    numerical_columns = []
    non_numerical_columns = []
    for aux_num in data.columns:
        unique_values_count = data[aux_num].nunique()
        if (unique_values_count > 2):
            numerical_columns.append(aux_num)
        else:
            non_numerical_columns.append(aux_num)
    # Normalization
    if (len(numerical_columns) != 0):
        scaler = StandardScaler()
        aux_data = scaler.fit_transform(data[numerical_columns])
        aux_data = pd.DataFrame(aux_data, columns=numerical_columns)
        aux_data = aux_data.reset_index(drop=True)
        data_non_numerical = data[non_numerical_columns].reset_index(drop=True)
        data = pd.concat([aux_data, data_non_numerical], axis=1)
                                                                                

    # Shuffle the DataFrame to randomize the rows
    data = data.sample(frac=1, random_state=seed)  
                            
    # Save some registers for testing performance:
    data_test = data.sample(frac=0.15, random_state=42)
    data = data.drop(data_test.index)

    # Calculate class weights
    class_counts = data[target_variable].value_counts()
    total_samples = class_counts.sum()
    class_weights = {cls: total_samples / count for cls, count in class_counts.items()}

    # Define custom scorer
    average_metric_type = 'macro'
    custom_scorer = make_scorer(f1_score, average=average_metric_type)

    #################################
    # NORMAL MODEL

    # If the function only returns the normal performance or the selection
    # method of new variables is feature_imp_normal
    if ((only_normal) | (selection_method is not None)):

        X_normal = data.drop(columns=[target_variable])
        y_normal = data[target_variable] 

        # Define RandomForestClassifier
        rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        
        # Define the parameter grid for grid search
        min_samples_split_grid_search = [5, 10, 15]
        min_samples_leaf_grid_search = [5, 10, 15]
        param_grid = {
            'min_samples_split': min_samples_split_grid_search,
            'min_samples_leaf' : min_samples_leaf_grid_search
        }
        
        # Perform grid search        
        grid_search = GridSearchCV(estimator=rf_normal, param_grid=param_grid,
                                   scoring=custom_scorer, cv=5, n_jobs=-1)
        # The normal model takes imbalance into account to generate 
        # maximum normal performance
        grid_search.fit(X_normal, y_normal, sample_weight=y_normal.map(class_weights))
        
        # Get the best normal model
        rf_normal = grid_search.best_estimator_

        # If the call to the function was to just calculate the
        # normal performance
        if (only_normal):
            features_test_normal = data_test.drop(target_variable, axis=1)
            target_test_normal = data_test[target_variable]
            predictions_normal = rf_normal.predict(features_test_normal)
            normal_score = f1_score(target_test_normal, predictions_normal, average=average_metric_type)
                           
            return (normal_score)
    
    
    #######################################
        
                                                
    #################################
    # GENERATION OF AUXILIARY MODELS
        
    # List of dictionaries
    list_of_dictionaries = []
                            
    # For each value of the target
    for target_value in sorted(list(data[target_variable].unique())):
                                
        # Generate auxiliary dataset
        dataset_aux = data[data[target_variable] == target_value]
                                
        # Discard target in auxiliary dataset
        dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
        # Generate dictionary of ficticious targets and the models that predict them:
        dictionary_aux = {}
                                
        for fict_target in dataset_aux.columns.tolist():
        
            # Train auxiliary model and save it
            X = dataset_aux.drop(columns=[fict_target])
            y = dataset_aux[fict_target]                                 

            if (type_aux_mod == "linear"):
                aux_model = LinearRegression(n_jobs=-1)
            else:
                if (type_aux_mod == "randomforest"):
                    aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5, n_jobs=-1)
                else:
                    print("Error: current allowed type for auxiliary models is 'linear' or 'randomforest'")
                    sys.exit()
            
            aux_model.fit(X, y)
                                        
            dictionary_aux[fict_target] = aux_model
                                        
                                    
        list_of_dictionaries.append(dictionary_aux)    
            

    #################################
    # GENERATION OF NEW COLUMNS BASED ON
    # AUXILIARY MODELS
        
    list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
    list_of_rows_dataframe_new = []
                          
    new_columns = []

    # For each value of the target
    for case in range(0, len(list_unique_values_target)):
                              
        dictionary_case = list_of_dictionaries[case]
                                    
        for fict_target in dictionary_case:
                                    
            X = data.drop(columns=[target_variable, fict_target])
            y_predicted = dictionary_case[fict_target].predict(X)
            y_real = data[fict_target]  
                            
            mse = (y_real - y_predicted) ** 2
                            
            rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                          
            # Add column to list of new columns
            new_columns.append(rmse)

        
    dataframe_new = pd.DataFrame(new_columns).transpose()
    names_cols_dataframe_new = []
    for case in range(0, len(list_unique_values_target)):
        dictionary_case = list_of_dictionaries[case]
        for u in dictionary_case.keys():
            names_cols_dataframe_new.append(u + "_" + str(case))
    dataframe_new.columns = names_cols_dataframe_new

    cols_a = data.columns.to_list()
    cols_b = dataframe_new.columns.to_list()
                            
    data = data.reset_index(drop=True)
    dataframe_new = dataframe_new.reset_index(drop=True)
                           
    # Concatenate horizontally
    result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
    result_df.columns = cols_b + cols_a    
                            
    #################################
    # SELECTION OF NEW COLUMNS CREATED TO CREATE
    # AUGMENTED DF.

    if (selection_method == "feature_imp_normal"):
        # Calculations for later selection of new columns
        feature_importances = rf_normal.feature_importances_
        # Create a list of (feature_name, importance) tuples
        feature_importance_tuples = zip(X_normal.columns, feature_importances)
        # Sort the tuples based on importance
        sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
        # Extract sorted feature names
        sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]
        # Select some features
        base_selected_columns = sorted_feature_names[0: int(len(sorted_feature_names) * frac_feature_imp_normal)]
        
        selected_columns = [] 
        for aux_base_cols in base_selected_columns:
            selected_columns.append(aux_base_cols + "_0")
            selected_columns.append(aux_base_cols + "_1")
        
    else:          
        if (selection_method == "correlation_target"):
            # Obtain new variables that have at least certain absolute correlation with the target
            correlation = result_df.corr()[target_variable].abs()
            # Select columns that have a at least certain absolute correlation with the target
            selected_columns = correlation[correlation >= new_cols_corr_thr].index
            selected_columns = [w for w in selected_columns if (w in dataframe_new.columns)]
            
        else:
            if (selection_method is None):
                # Select all created new columns
                selected_columns = dataframe_new.columns.tolist()            
                
            else:
                print("Error. The parameter selection_method must be 'feature_imp_normal', 'correlation_target' or None")
                sys.exit()
                    

    # The augmented dataframe will contain the original columns plus the 
    # selected new columns
    result_df = result_df.loc[:, data.columns.tolist() + selected_columns]

                            

    #################################
    # TRAINING OF MODEL BASED ON AUGMENTED DF.

    features = result_df.drop(target_variable, axis=1)
    target = result_df[target_variable]
    
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        
    # Define the parameter grid for grid search
    min_samples_split_grid_search = [5, 10, 15]
    min_samples_leaf_grid_search = [5, 10, 15]
    param_grid = {
        'min_samples_split': min_samples_split_grid_search,
        'min_samples_leaf' : min_samples_leaf_grid_search
    }
        
    # Perform grid search
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                              scoring=custom_scorer, cv=5, n_jobs=-1)
    # It is not advised to apply imbalance correction to the enhanced dataframe
    # grid_search.fit(features, target, sample_weight=target.map(class_weights))
    # Instead, normal fit is advisable to generate better performance
    grid_search.fit(features, target)
    
    # Get the best model and corresponding score
    rf_model = grid_search.best_estimator_
                            

    #################################
    # PROCESSING OF TEST DATASET SO THAT THE TRAINED MODEL
    # CAN BE APPLIED TO IT, AND IN THIS WAY OBTAIN 
    # PERFORMANCE METRICS
    
    # For each value of the target
    new_columns = []
    for case in range(0, len(list_unique_values_target)):
                                
        dictionary_case = list_of_dictionaries[case]
        
        for fict_target in dictionary_case:
            if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                X = data_test.drop(columns=[target_variable, fict_target])
                y_predicted = dictionary_case[fict_target].predict(X)
                y_real = data_test[fict_target]  
                               
                mse = (y_real - y_predicted) ** 2
                                
                rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                
                # Add column to list of new columns
                new_columns.append(rmse)
                            
    dataframe_new2 = pd.DataFrame(new_columns).transpose()
    names_cols_dataframe_new2 = []
    for case in range(0, len(list_unique_values_target)):
        dictionary_case = list_of_dictionaries[case]
        for fict_target in dictionary_case:
            if ((fict_target + '_' + str(case)) in selected_columns):    
                names_cols_dataframe_new2.append(fict_target + "_" + str(case))
    dataframe_new2.columns = names_cols_dataframe_new2
                            
    cols_a = data_test.columns.to_list()
    cols_b = dataframe_new2.columns.to_list()
                            
    data_test = data_test.reset_index(drop=True)
    dataframe_new2 = dataframe_new2.reset_index(drop=True)
                                                    
    # Concatenate horizontally
    data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
    data_test_processed.columns = cols_b + cols_a
    # Adapt order of columns to the order in which the model was
    # trained
    data_test_processed = data_test_processed[result_df.columns.to_list()]
                         
    # Now apply trained model on test dataset to gauge performance
    features_test = data_test_processed.drop(target_variable, axis=1)
    target_test = data_test_processed[target_variable]
    predictions = rf_model.predict(features_test)
                           
    # Calculate F1 score
    f1 = f1_score(target_test, predictions, average=average_metric_type)
    return (f1)






##########################################################################################################################################

# TEST ON DATAFRAMES FROM UCI, DOWNLOAD ADAPTED FROM Perales-González, Carlos, (2020). UCI download-process, v1.3, GitHub repository, https://github.com/cperales/uci-download-process

# Number of statistical repetitions
num_stat_rep = 10

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]

for i in files_classification:
    try:
        
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header_option = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]  
                   data_url = data_url.strip()
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       try:
                           if (len(lines[x].split(" = ")) == 1):
                               pass
                           # If the separator is specified    
                           else:
                               separator = lines[x].split(" = ")[1]
                               separator = separator.strip()
                               if ("comma" in separator):
                                   separator = ","
                       except:
                           pass
                               
                   else:
                       if ('target_index' in lines[x]):    
                           try:
                               target_index = lines[x].split(" = ")[1]
                               target_index = int(target_index.strip())
                           except:
                               pass
                       else:
                           if ('header' in lines[x]):    
                               try:
                                   # If there is not header specified, assume
                                   # default value
                                   if (len(lines[x].split(" = ")) == 1):
                                        pass
                                   else:
                                       # If it is specified
                                       header_option = lines[x].split(" = ")[1]
                                       header_option = int(header_option.strip())
                                       # If it is 0 assign it to None
                                       if (str(header_option).startswith("0")):
                                           header_option = None
                               except:
                                   pass
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            try:
                data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python', header=header_option)
                aux_names_columns = [str(aux_n_c) for aux_n_c in data.columns]
                data.columns = aux_names_columns
                    
                # If there will be enough dimensionality after one hot encoding
                data_check = pd.get_dummies(data)
                if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                    # Name of the target of the dataset (target_index - 1 since 
                    # in python first position is 0)
                    target_variable = str(data.columns.tolist()[int(target_index) - 1])
        
                    # If it is a binary classification
                    unique_values_count = data[target_variable].nunique()
                    if (unique_values_count == 2):
                        
                        normal_performance = []
                        for c in range(0, num_stat_rep):
                            normal_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                           only_normal=True, seed=c))

                        normal_performance_stdev = statistics.stdev(normal_performance)
                        normal_performance = statistics.mean(normal_performance)
                        print("Normal performance: " + str(round(normal_performance, 4)) + " +- " + str(round(normal_performance_stdev, 4)))
                        
                        cases_new = ["type_aux_mode=linear; selection_method_case=None"]
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=None")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.1")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.2")
                        cases_new.append("type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.1")
                        cases_new.append("type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.2")

                        cases_new_performances = []
                        cases_new_performances_stdev = []
                        
                        
                        for case in cases_new:
                            
                            if ("linear" in case):
                                model_aux = "linear"
                            else:
                                model_aux = "randomforest"

                            if ("None" in case):
                                new_performance = []
                                for c in range(0, num_stat_rep):                            
                                    new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                   type_aux_mod=model_aux, selection_method=None,
                                                                    seed=c))
                                new_performance_stdev = statistics.stdev(new_performance)    
                                cases_new_performances_stdev.append(new_performance_stdev)
                                new_performance = statistics.mean(new_performance)        
                                cases_new_performances.append(new_performance)
                                
                            else:     
                                if ("feature_imp_normal" in case):
                                    frac_feature_imp_normal_value = float(case.split("frac_feature_imp_normal=")[1])
                                    new_performance = []
                                    for c in range(0, num_stat_rep):         
                                        new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                      type_aux_mod=model_aux, selection_method="feature_imp_normal", 
                                                                      frac_feature_imp_normal=frac_feature_imp_normal_value,
                                                                      seed=c))
                                    new_performance_stdev = statistics.stdev(new_performance)
                                    cases_new_performances_stdev.append(new_performance_stdev)    
                                    new_performance = statistics.mean(new_performance)        
                                    cases_new_performances.append(new_performance)
                                else:
                                    new_cols_corr_thr_value = float(case.split("new_cols_corr_thr=")[1])
                                    new_performance = []
                                    for c in range(0, num_stat_rep):         
                                        new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                     type_aux_mod=model_aux, selection_method="correlation_target", 
                                                                     new_cols_corr_thr=new_cols_corr_thr_value, 
                                                                     seed=c))
                                        
                                    new_performance_stdev = statistics.stdev(new_performance)
                                    cases_new_performances_stdev.append(new_performance_stdev)    
                                    new_performance = statistics.mean(new_performance)        
                                    cases_new_performances.append(new_performance)
                                    
                            print(case)
                            print(str(round(new_performance, 4)) + " +- " + str(round(new_performance_stdev, 4)))
                            

                        # Save result
                        result = pd.DataFrame({"Case": cases_new, "Mean F1-score": cases_new_performances, "Stdev F1-score": cases_new_performances_stdev})
                        result["Normal Mean F1-score"] = normal_performance
                        result["Normal stdev F1-score"] = normal_performance_stdev
                        result = result.sort_values(by="Mean F1-score", ascending=False)
                        data_name = data_url.split("/")
                        data_name = data_name[len(data_name) - 1]
                        display(result)
                        result.to_csv("metrics_" + data_name + ".csv", index=False)
                        
                        # Get maximum performance of the new cases
                        if (max(cases_new_performances) > normal_performance):
                            
                            print("New performance: " + str(round(max(cases_new_performances), 3)))
                            # Print details
                            max_index = cases_new_performances.index(max(cases_new_performances))    
                            print("Optimum case new: " + str(cases_new[max_index]))
                        else:
                            print("No improvement found")                        
                                                        
            
            except:   
                print("Dataset " + i + " could not be processed.")
                    
            
    except FileNotFoundError:  
        print(f"The file '{i}' does not exist.")




Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.7824 +- 0.0037
type_aux_mode=linear; selection_method_case=None
0.7876 +- 0.0063
type_aux_mode=randomforest; selection_method_case=None
0.776 +- 0.0056
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.7908 +- 0.0066
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.7935 +- 0.0061
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5
0.7915 +- 0.0055
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7
0.7888 +- 0.0067
type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.774 +- 0.0082
type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.7763 +- 0.0077
type_au



type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.2
0.779 +- 0.0073


Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
10,type_aux_mode_case=linear; selection_method_ca...,0.795363,0.006742,0.782361,0.00365
11,type_aux_mode_case=linear; selection_method_ca...,0.793569,0.00562,0.782361,0.00365
3,type_aux_mode=linear; selection_method_case=fe...,0.793462,0.006128,0.782361,0.00365
4,type_aux_mode=linear; selection_method_case=fe...,0.791547,0.00553,0.782361,0.00365
2,type_aux_mode=linear; selection_method_case=fe...,0.790843,0.006596,0.782361,0.00365
5,type_aux_mode=linear; selection_method_case=fe...,0.788783,0.006679,0.782361,0.00365
0,type_aux_mode=linear; selection_method_case=None,0.78756,0.006325,0.782361,0.00365
13,type_aux_mode_case=randomforest; selection_met...,0.778967,0.007253,0.782361,0.00365
7,type_aux_mode=randomforest; selection_method_c...,0.776307,0.00766,0.782361,0.00365
1,type_aux_mode=randomforest; selection_method_c...,0.776012,0.005648,0.782361,0.00365


New performance: 0.795
Optimum case new: type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.1
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small+adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://ar

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
1,type_aux_mode=randomforest; selection_method_c...,0.998486,0.002438,0.991955,0.007678
12,type_aux_mode_case=randomforest; selection_met...,0.998486,0.002438,0.991955,0.007678
13,type_aux_mode_case=randomforest; selection_met...,0.998486,0.002438,0.991955,0.007678
8,type_aux_mode=randomforest; selection_method_c...,0.997974,0.003563,0.991955,0.007678
9,type_aux_mode=randomforest; selection_method_c...,0.997974,0.003563,0.991955,0.007678
3,type_aux_mode=linear; selection_method_case=fe...,0.997501,0.003521,0.991955,0.007678
4,type_aux_mode=linear; selection_method_case=fe...,0.997501,0.003521,0.991955,0.007678
5,type_aux_mode=linear; selection_method_case=fe...,0.997501,0.003521,0.991955,0.007678
11,type_aux_mode_case=linear; selection_method_ca...,0.996507,0.004711,0.991955,0.007678
0,type_aux_mode=linear; selection_method_case=None,0.9965,0.003361,0.991955,0.007678


New performance: 0.998
Optimum case new: type_aux_mode=randomforest; selection_method_case=None
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.6788 +- 0.0315
type_aux_mode=linear; selection_method_case=None
0.6025 +- 0.0623
type_aux_mode=randomforest; selection_method_case=None
0.6284 +- 0.0435
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.6152 +- 0.0336
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.6277 +- 0.0521
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5
0.6117 +- 0.061
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7
0.6117 +- 0.061
type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.6152 +- 0.0336
type_aux_

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
13,type_aux_mode_case=randomforest; selection_met...,0.641212,0.065373,0.678828,0.031536
1,type_aux_mode=randomforest; selection_method_c...,0.628433,0.043542,0.678828,0.031536
3,type_aux_mode=linear; selection_method_case=fe...,0.627715,0.052091,0.678828,0.031536
7,type_aux_mode=randomforest; selection_method_c...,0.624614,0.035812,0.678828,0.031536
2,type_aux_mode=linear; selection_method_case=fe...,0.615211,0.033604,0.678828,0.031536
6,type_aux_mode=randomforest; selection_method_c...,0.615211,0.033604,0.678828,0.031536
12,type_aux_mode_case=randomforest; selection_met...,0.613056,0.040687,0.678828,0.031536
4,type_aux_mode=linear; selection_method_case=fe...,0.611704,0.060978,0.678828,0.031536
5,type_aux_mode=linear; selection_method_case=fe...,0.611704,0.060978,0.678828,0.031536
8,type_aux_mode=randomforest; selection_method_c...,0.609923,0.04467,0.678828,0.031536


No improvement found
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.963 +- 0.0171
type_aux_mode=linear; selection_method_case=None
0.9674 +- 0.0129
type_aux_mode=randomforest; selection_method_case=None
0.9725 +- 0.0102
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.9596 +- 0.0136
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.9619 +- 0.0145
type_aux_mode=linear; selection_method_case=feature_imp_nor

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
1,type_aux_mode=randomforest; selection_method_c...,0.9725,0.010152,0.962994,0.017077
13,type_aux_mode_case=randomforest; selection_met...,0.972491,0.011458,0.962994,0.017077
12,type_aux_mode_case=randomforest; selection_met...,0.970377,0.011089,0.962994,0.017077
9,type_aux_mode=randomforest; selection_method_c...,0.968351,0.012409,0.962994,0.017077
8,type_aux_mode=randomforest; selection_method_c...,0.967469,0.012784,0.962994,0.017077
0,type_aux_mode=linear; selection_method_case=None,0.967364,0.012898,0.962994,0.017077
11,type_aux_mode_case=linear; selection_method_ca...,0.964216,0.011639,0.962994,0.017077
5,type_aux_mode=linear; selection_method_case=fe...,0.963116,0.013744,0.962994,0.017077
7,type_aux_mode=randomforest; selection_method_c...,0.963019,0.012593,0.962994,0.017077
4,type_aux_mode=linear; selection_method_case=fe...,0.962087,0.013695,0.962994,0.017077


New performance: 0.973
Optimum case new: type_aux_mode=randomforest; selection_method_case=None
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wpbc.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king/krkopt.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.9818 +- 0.0085
type_aux_mode=linear; selection_method_case=None
0.9765 +- 0.0102
type_aux_mode=randomforest; selection_method_case=None
0.9784 +- 0.0096
type_aux_mode=linear; selection_method_case=feature_imp_nor

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
12,type_aux_mode_case=randomforest; selection_met...,0.982818,0.009266,0.981764,0.00846
2,type_aux_mode=linear; selection_method_case=fe...,0.9805,0.008253,0.981764,0.00846
6,type_aux_mode=randomforest; selection_method_c...,0.979246,0.00941,0.981764,0.00846
11,type_aux_mode_case=linear; selection_method_ca...,0.979037,0.008137,0.981764,0.00846
13,type_aux_mode_case=randomforest; selection_met...,0.979037,0.008137,0.981764,0.00846
10,type_aux_mode_case=linear; selection_method_ca...,0.978824,0.009212,0.981764,0.00846
8,type_aux_mode=randomforest; selection_method_c...,0.97863,0.009979,0.981764,0.00846
3,type_aux_mode=linear; selection_method_case=fe...,0.978625,0.010858,0.981764,0.00846
1,type_aux_mode=randomforest; selection_method_c...,0.978429,0.009567,0.981764,0.00846
9,type_aux_mode=randomforest; selection_method_c...,0.978418,0.007891,0.981764,0.00846


New performance: 0.983
Optimum case new: type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.1
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00233/CNAE-9.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/vowel/vowel-context.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data...
File downloaded successfully to dataset_file_au

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
2,type_aux_mode=linear; selection_method_case=fe...,0.999783,0.000349,0.999711,0.000374
11,type_aux_mode_case=linear; selection_method_ca...,0.999783,0.000349,0.999711,0.000374
6,type_aux_mode=randomforest; selection_method_c...,0.999711,0.000505,0.999711,0.000374
3,type_aux_mode=linear; selection_method_case=fe...,0.999711,0.000374,0.999711,0.000374
7,type_aux_mode=randomforest; selection_method_c...,0.99971,0.000375,0.999711,0.000374
0,type_aux_mode=linear; selection_method_case=None,0.999639,0.00051,0.999711,0.000374
4,type_aux_mode=linear; selection_method_case=fe...,0.999639,0.00051,0.999711,0.000374
10,type_aux_mode_case=linear; selection_method_ca...,0.999639,0.00051,0.999711,0.000374
8,type_aux_mode=randomforest; selection_method_c...,0.999638,0.000381,0.999711,0.000374
9,type_aux_mode=randomforest; selection_method_c...,0.999638,0.000381,0.999711,0.000374


New performance: 1.0
Optimum case new: type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00244/fertility_Diagnosis.txt...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.6405 +- 0.1033
type_aux_mode=linear; selection_method_case=None
0.5397 +- 0.0759
type_aux_mode=randomforest; selection_method_case=None
0.5187 +- 0.0786
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.5394 +- 0.0713
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.5394 +- 0.071

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
4,type_aux_mode=linear; selection_method_case=fe...,0.583612,0.067176,0.640499,0.103315
11,type_aux_mode_case=linear; selection_method_ca...,0.582095,0.05287,0.640499,0.103315
13,type_aux_mode_case=randomforest; selection_met...,0.574805,0.103937,0.640499,0.103315
10,type_aux_mode_case=linear; selection_method_ca...,0.559097,0.060452,0.640499,0.103315
8,type_aux_mode=randomforest; selection_method_c...,0.541997,0.068772,0.640499,0.103315
9,type_aux_mode=randomforest; selection_method_c...,0.541265,0.074614,0.640499,0.103315
0,type_aux_mode=linear; selection_method_case=None,0.539732,0.07589,0.640499,0.103315
2,type_aux_mode=linear; selection_method_case=fe...,0.539376,0.071329,0.640499,0.103315
3,type_aux_mode=linear; selection_method_case=fe...,0.539376,0.071329,0.640499,0.103315
6,type_aux_mode=randomforest; selection_method_c...,0.539376,0.071329,0.640499,0.103315


No improvement found
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/hayes-roth/hayes-roth.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/hepatitis.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/hill-valley/Hill_Valley_without_noise_Training.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/hill-valley/Hill_Valley_with_noise_Tra

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
9,type_aux_mode=randomforest; selection_method_c...,0.606677,0.057884,0.629257,0.052268
7,type_aux_mode=randomforest; selection_method_c...,0.60139,0.063596,0.629257,0.052268
8,type_aux_mode=randomforest; selection_method_c...,0.594648,0.038173,0.629257,0.052268
13,type_aux_mode_case=randomforest; selection_met...,0.59171,0.04829,0.629257,0.052268
6,type_aux_mode=randomforest; selection_method_c...,0.57955,0.046795,0.629257,0.052268
1,type_aux_mode=randomforest; selection_method_c...,0.577245,0.057708,0.629257,0.052268
3,type_aux_mode=linear; selection_method_case=fe...,0.563932,0.042367,0.629257,0.052268
5,type_aux_mode=linear; selection_method_case=fe...,0.560705,0.034196,0.629257,0.052268
2,type_aux_mode=linear; selection_method_case=fe...,0.558754,0.04825,0.629257,0.052268
12,type_aux_mode_case=randomforest; selection_met...,0.557468,0.045289,0.629257,0.052268


No improvement found
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/libras/movement_libras.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/lung-cancer/lung-cancer.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data...
File downloaded successfully to dataset_file_aux.txt.
No

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
5,type_aux_mode=linear; selection_method_case=fe...,0.859456,0.006124,0.856379,0.007044
10,type_aux_mode_case=linear; selection_method_ca...,0.859072,0.004448,0.856379,0.007044
3,type_aux_mode=linear; selection_method_case=fe...,0.858987,0.005207,0.856379,0.007044
4,type_aux_mode=linear; selection_method_case=fe...,0.858576,0.004886,0.856379,0.007044
0,type_aux_mode=linear; selection_method_case=None,0.858103,0.00466,0.856379,0.007044
11,type_aux_mode_case=linear; selection_method_ca...,0.857954,0.00568,0.856379,0.007044
2,type_aux_mode=linear; selection_method_case=fe...,0.853276,0.006046,0.856379,0.007044
8,type_aux_mode=randomforest; selection_method_c...,0.840889,0.004936,0.856379,0.007044
9,type_aux_mode=randomforest; selection_method_c...,0.839561,0.006717,0.856379,0.007044
1,type_aux_mode=randomforest; selection_method_c...,0.836325,0.004466,0.856379,0.007044


New performance: 0.859
Optimum case new: type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7
Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/monks-problems/monks-1.test...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/monks-problems/monks-2.test...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/monks-problems/monks-3.test...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 1.0 +-

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
0,type_aux_mode=linear; selection_method_case=None,1.0,0.0,1.0,0.0
1,type_aux_mode=randomforest; selection_method_c...,1.0,0.0,1.0,0.0
2,type_aux_mode=linear; selection_method_case=fe...,1.0,0.0,1.0,0.0
3,type_aux_mode=linear; selection_method_case=fe...,1.0,0.0,1.0,0.0
4,type_aux_mode=linear; selection_method_case=fe...,1.0,0.0,1.0,0.0
5,type_aux_mode=linear; selection_method_case=fe...,1.0,0.0,1.0,0.0
6,type_aux_mode=randomforest; selection_method_c...,1.0,0.0,1.0,0.0
7,type_aux_mode=randomforest; selection_method_c...,1.0,0.0,1.0,0.0
8,type_aux_mode=randomforest; selection_method_c...,1.0,0.0,1.0,0.0
9,type_aux_mode=randomforest; selection_method_c...,1.0,0.0,1.0,0.0


No improvement found
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/eighthr.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/onehr.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00230/plrx.txt...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archiv

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
0,type_aux_mode=linear; selection_method_case=None,0.997542,0.0006,0.99653,0.000692
10,type_aux_mode_case=linear; selection_method_ca...,0.997542,0.0006,0.99653,0.000692
11,type_aux_mode_case=linear; selection_method_ca...,0.997258,0.000525,0.99653,0.000692
5,type_aux_mode=linear; selection_method_case=fe...,0.99721,0.000753,0.99653,0.000692
2,type_aux_mode=linear; selection_method_case=fe...,0.99705,0.000742,0.99653,0.000692
3,type_aux_mode=linear; selection_method_case=fe...,0.99705,0.000742,0.99653,0.000692
6,type_aux_mode=randomforest; selection_method_c...,0.99705,0.000742,0.99653,0.000692
7,type_aux_mode=randomforest; selection_method_c...,0.99705,0.000742,0.99653,0.000692
4,type_aux_mode=linear; selection_method_case=fe...,0.996608,0.000792,0.99653,0.000692
1,type_aux_mode=randomforest; selection_method_c...,0.993955,0.001285,0.99653,0.000692


New performance: 0.998
Optimum case new: type_aux_mode=linear; selection_method_case=None
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-small.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.9367 +- 0.0106
type_aux_mode=linear; selection_method_case=None
0.9331 +- 0.0109
type_aux_mode=randomforest; selection_method_case=None
0.9407 +- 0.0108
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.9316 +- 0.0116
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.9325 +- 0.0096
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5
0.9351 +- 0.0104
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_norma

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
1,type_aux_mode=randomforest; selection_method_c...,0.940708,0.01084,0.936735,0.010633
8,type_aux_mode=randomforest; selection_method_c...,0.940117,0.009925,0.936735,0.010633
9,type_aux_mode=randomforest; selection_method_c...,0.939094,0.011571,0.936735,0.010633
12,type_aux_mode_case=randomforest; selection_met...,0.938246,0.010035,0.936735,0.010633
7,type_aux_mode=randomforest; selection_method_c...,0.937165,0.009067,0.936735,0.010633
13,type_aux_mode_case=randomforest; selection_met...,0.935706,0.011547,0.936735,0.010633
4,type_aux_mode=linear; selection_method_case=fe...,0.935061,0.010414,0.936735,0.010633
10,type_aux_mode_case=linear; selection_method_ca...,0.934132,0.012193,0.936735,0.010633
0,type_aux_mode=linear; selection_method_case=None,0.933097,0.010941,0.936735,0.010633
5,type_aux_mode=linear; selection_method_case=fe...,0.932568,0.010459,0.936735,0.010633


New performance: 0.941
Optimum case new: type_aux_mode=randomforest; selection_method_case=None
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECT.train...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.train...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z...
File downloaded successfully to dataset_file_aux.txt.
Dataset C:\Users\fjlobo\Desktop\estudio_mod\descarga_de_datasets_de_uci\directorio3\datafiles\classifica

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
0,type_aux_mode=linear; selection_method_case=None,0.975289,0.008928,0.954425,0.015065
2,type_aux_mode=linear; selection_method_case=fe...,0.975289,0.008928,0.954425,0.015065
3,type_aux_mode=linear; selection_method_case=fe...,0.975289,0.008928,0.954425,0.015065
4,type_aux_mode=linear; selection_method_case=fe...,0.975289,0.008928,0.954425,0.015065
5,type_aux_mode=linear; selection_method_case=fe...,0.975289,0.008928,0.954425,0.015065
10,type_aux_mode_case=linear; selection_method_ca...,0.975289,0.008928,0.954425,0.015065
11,type_aux_mode_case=linear; selection_method_ca...,0.975289,0.008928,0.954425,0.015065
12,type_aux_mode_case=randomforest; selection_met...,0.88691,0.050814,0.954425,0.015065
13,type_aux_mode_case=randomforest; selection_met...,0.88691,0.050814,0.954425,0.015065
6,type_aux_mode=randomforest; selection_method_c...,0.874202,0.032825,0.954425,0.015065


New performance: 0.975
Optimum case new: type_aux_mode=linear; selection_method_case=None
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00194/sensor_readings_2.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00194/sensor_readings_24.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00194/sensor_readings_4.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00273/Example_WearableComputing_weight_lifting_exercises_biceps_curl_variations.csv...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uc

In [70]:
import os
import sys
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import statistics


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


def aux_func(data, target_variable, only_normal=False, type_aux_mod="linear", selection_method=None, 
             frac_feature_imp_normal=0.1, new_cols_corr_thr=0.2, seed=123):
    
    #################################
    # PREPROCESSING
    data = data.drop_duplicates()
                            
    # Handling missing values (drop rows with missing values for simplicity)
    data.dropna(inplace=True)
    
    # Encoding categorical variables using one-hot encoding (OHE).
    # First convert the target into categorical. This facilitates later
    # calculations
    data[target_variable] = data[target_variable].astype("category")
    # Now apply OHE to categorical columns (including the target)
    data = pd.get_dummies(data)

    # After OHE there will be two columns created for the target (since it was binary).
    # Choose one of those columns as target and discard the other one. It is not important
    # which of the two is selected as target, since the F1-score performance is measured
    # using 'macro' average option
    aux_names_target = [str(n) for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
    target_variable = aux_names_target[0]
    data = data.drop(columns=[aux_names_target[1]])
                            
    # Normalizing numerical variables. Since previously OHE
    # was performed, all variables with more than 2 different
    # values will be numerical
    numerical_columns = []
    non_numerical_columns = []
    for aux_num in data.columns:
        unique_values_count = data[aux_num].nunique()
        if (unique_values_count > 2):
            numerical_columns.append(aux_num)
        else:
            non_numerical_columns.append(aux_num)
    # Normalization
    if (len(numerical_columns) != 0):
        scaler = StandardScaler()
        aux_data = scaler.fit_transform(data[numerical_columns])
        aux_data = pd.DataFrame(aux_data, columns=numerical_columns)
        aux_data = aux_data.reset_index(drop=True)
        data_non_numerical = data[non_numerical_columns].reset_index(drop=True)
        data = pd.concat([aux_data, data_non_numerical], axis=1)
                                                                                

    # Shuffle the DataFrame to randomize the rows
    data = data.sample(frac=1, random_state=seed)  
                            
    # Save some registers for testing performance:
    data_test = data.sample(frac=0.15, random_state=42)
    data = data.drop(data_test.index)

    # Calculate class weights
    class_counts = data[target_variable].value_counts()
    total_samples = class_counts.sum()
    class_weights = {cls: total_samples / count for cls, count in class_counts.items()}

    # Define custom scorer
    average_metric_type = 'macro'
    custom_scorer = make_scorer(f1_score, average=average_metric_type)

    #################################
    # NORMAL MODEL

    # If the function only returns the normal performance or the selection
    # method of new variables is feature_imp_normal
    if ((only_normal) | (selection_method is not None)):

        X_normal = data.drop(columns=[target_variable])
        y_normal = data[target_variable] 

        # Define RandomForestClassifier
        rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        
        # Define the parameter grid for grid search
        min_samples_split_grid_search = [5, 10, 15]
        min_samples_leaf_grid_search = [5, 10, 15]
        param_grid = {
            'min_samples_split': min_samples_split_grid_search,
            'min_samples_leaf' : min_samples_leaf_grid_search
        }
        
        # Perform grid search        
        grid_search = GridSearchCV(estimator=rf_normal, param_grid=param_grid,
                                   scoring=custom_scorer, cv=5, n_jobs=-1)
        # The normal model takes imbalance into account to generate 
        # maximum normal performance
        grid_search.fit(X_normal, y_normal, sample_weight=y_normal.map(class_weights))
        
        # Get the best normal model
        rf_normal = grid_search.best_estimator_

        # If the call to the function was to just calculate the
        # normal performance
        if (only_normal):
            features_test_normal = data_test.drop(target_variable, axis=1)
            target_test_normal = data_test[target_variable]
            predictions_normal = rf_normal.predict(features_test_normal)
            normal_score = f1_score(target_test_normal, predictions_normal, average=average_metric_type)
                           
            return (normal_score)
    
    
    #######################################
        
                                                
    #################################
    # GENERATION OF AUXILIARY MODELS
        
    # List of dictionaries
    list_of_dictionaries = []
                            
    # For each value of the target
    for target_value in sorted(list(data[target_variable].unique())):
                                
        # Generate auxiliary dataset
        dataset_aux = data[data[target_variable] == target_value]
                                
        # Discard target in auxiliary dataset
        dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
        # Generate dictionary of ficticious targets and the models that predict them:
        dictionary_aux = {}
                                
        for fict_target in dataset_aux.columns.tolist():
        
            # Train auxiliary model and save it
            X = dataset_aux.drop(columns=[fict_target])
            y = dataset_aux[fict_target]                                 

            if (type_aux_mod == "linear"):
                aux_model = LinearRegression(n_jobs=-1)
            else:
                if (type_aux_mod == "randomforest"):
                    aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5, n_jobs=-1)
                else:
                    print("Error: current allowed type for auxiliary models is 'linear' or 'randomforest'")
                    sys.exit()
            
            aux_model.fit(X, y)
                                        
            dictionary_aux[fict_target] = aux_model
                                        
                                    
        list_of_dictionaries.append(dictionary_aux)    
            

    #################################
    # GENERATION OF NEW COLUMNS BASED ON
    # AUXILIARY MODELS
        
    list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
    list_of_rows_dataframe_new = []
                          
    new_columns = []

    # For each value of the target
    for case in range(0, len(list_unique_values_target)):
                              
        dictionary_case = list_of_dictionaries[case]
                                    
        for fict_target in dictionary_case:
                                    
            X = data.drop(columns=[target_variable, fict_target])
            y_predicted = dictionary_case[fict_target].predict(X)
            y_real = data[fict_target]  
                            
            mse = (y_real - y_predicted) ** 2
                            
            rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                          
            # Add column to list of new columns
            new_columns.append(rmse)

        
    dataframe_new = pd.DataFrame(new_columns).transpose()
    names_cols_dataframe_new = []
    for case in range(0, len(list_unique_values_target)):
        dictionary_case = list_of_dictionaries[case]
        for u in dictionary_case.keys():
            names_cols_dataframe_new.append(u + "_" + str(case))
    dataframe_new.columns = names_cols_dataframe_new

    cols_a = data.columns.to_list()
    cols_b = dataframe_new.columns.to_list()
                            
    data = data.reset_index(drop=True)
    dataframe_new = dataframe_new.reset_index(drop=True)
                           
    # Concatenate horizontally
    result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
    result_df.columns = cols_b + cols_a    
                            
    #################################
    # SELECTION OF NEW COLUMNS CREATED TO CREATE
    # AUGMENTED DF.

    if (selection_method == "feature_imp_normal"):
        # Calculations for later selection of new columns
        feature_importances = rf_normal.feature_importances_
        # Create a list of (feature_name, importance) tuples
        feature_importance_tuples = zip(X_normal.columns, feature_importances)
        # Sort the tuples based on importance
        sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
        # Extract sorted feature names
        sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]
        # Select some features
        base_selected_columns = sorted_feature_names[0: int(len(sorted_feature_names) * frac_feature_imp_normal)]
        
        selected_columns = [] 
        for aux_base_cols in base_selected_columns:
            selected_columns.append(aux_base_cols + "_0")
            selected_columns.append(aux_base_cols + "_1")
        
    else:          
        if (selection_method == "correlation_target"):
            # Obtain new variables that have at least certain absolute correlation with the target
            correlation = result_df.corr()[target_variable].abs()
            # Select columns that have a at least certain absolute correlation with the target
            selected_columns = correlation[correlation >= new_cols_corr_thr].index
            selected_columns = [w for w in selected_columns if (w in dataframe_new.columns)]
            
        else:
            if (selection_method is None):
                # Select all created new columns
                selected_columns = dataframe_new.columns.tolist()            
                
            else:
                print("Error. The parameter selection_method must be 'feature_imp_normal', 'correlation_target' or None")
                sys.exit()
                    

    # The augmented dataframe will contain the original columns plus the 
    # selected new columns
    result_df = result_df.loc[:, data.columns.tolist() + selected_columns]

                            

    #################################
    # TRAINING OF MODEL BASED ON AUGMENTED DF.

    features = result_df.drop(target_variable, axis=1)
    target = result_df[target_variable]
    
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        
    # Define the parameter grid for grid search
    min_samples_split_grid_search = [5, 10, 15]
    min_samples_leaf_grid_search = [5, 10, 15]
    param_grid = {
        'min_samples_split': min_samples_split_grid_search,
        'min_samples_leaf' : min_samples_leaf_grid_search
    }
        
    # Perform grid search
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                              scoring=custom_scorer, cv=5, n_jobs=-1)
    
    grid_search.fit(features, target, sample_weight=target.map(class_weights))
  
    # grid_search.fit(features, target)
    
    # Get the best model and corresponding score
    rf_model = grid_search.best_estimator_
                            

    #################################
    # PROCESSING OF TEST DATASET SO THAT THE TRAINED MODEL
    # CAN BE APPLIED TO IT, AND IN THIS WAY OBTAIN 
    # PERFORMANCE METRICS
    
    # For each value of the target
    new_columns = []
    for case in range(0, len(list_unique_values_target)):
                                
        dictionary_case = list_of_dictionaries[case]
        
        for fict_target in dictionary_case:
            if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                X = data_test.drop(columns=[target_variable, fict_target])
                y_predicted = dictionary_case[fict_target].predict(X)
                y_real = data_test[fict_target]  
                               
                mse = (y_real - y_predicted) ** 2
                                
                rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                
                # Add column to list of new columns
                new_columns.append(rmse)
                            
    dataframe_new2 = pd.DataFrame(new_columns).transpose()
    names_cols_dataframe_new2 = []
    for case in range(0, len(list_unique_values_target)):
        dictionary_case = list_of_dictionaries[case]
        for fict_target in dictionary_case:
            if ((fict_target + '_' + str(case)) in selected_columns):    
                names_cols_dataframe_new2.append(fict_target + "_" + str(case))
    dataframe_new2.columns = names_cols_dataframe_new2
                            
    cols_a = data_test.columns.to_list()
    cols_b = dataframe_new2.columns.to_list()
                            
    data_test = data_test.reset_index(drop=True)
    dataframe_new2 = dataframe_new2.reset_index(drop=True)
                                                    
    # Concatenate horizontally
    data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
    data_test_processed.columns = cols_b + cols_a
    # Adapt order of columns to the order in which the model was
    # trained
    data_test_processed = data_test_processed[result_df.columns.to_list()]
                         
    # Now apply trained model on test dataset to gauge performance
    features_test = data_test_processed.drop(target_variable, axis=1)
    target_test = data_test_processed[target_variable]
    predictions = rf_model.predict(features_test)
                           
    # Calculate F1 score
    f1 = f1_score(target_test, predictions, average=average_metric_type)
    return (f1)






##########################################################################################################################################

# TEST ON DATAFRAMES FROM UCI, DOWNLOAD ADAPTED FROM Perales-González, Carlos, (2020). UCI download-process, v1.3, GitHub repository, https://github.com/cperales/uci-download-process

# Number of statistical repetitions
num_stat_rep = 10

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]

for i in files_classification:
    try:
        
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header_option = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]  
                   data_url = data_url.strip()
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       try:
                           if (len(lines[x].split(" = ")) == 1):
                               pass
                           # If the separator is specified    
                           else:
                               separator = lines[x].split(" = ")[1]
                               separator = separator.strip()
                               if ("comma" in separator):
                                   separator = ","
                       except:
                           pass
                               
                   else:
                       if ('target_index' in lines[x]):    
                           try:
                               target_index = lines[x].split(" = ")[1]
                               target_index = int(target_index.strip())
                           except:
                               pass
                       else:
                           if ('header' in lines[x]):    
                               try:
                                   # If there is not header specified, assume
                                   # default value
                                   if (len(lines[x].split(" = ")) == 1):
                                        pass
                                   else:
                                       # If it is specified
                                       header_option = lines[x].split(" = ")[1]
                                       header_option = int(header_option.strip())
                                       # If it is 0 assign it to None
                                       if (str(header_option).startswith("0")):
                                           header_option = None
                               except:
                                   pass
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            try:
                data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python', header=header_option)
                aux_names_columns = [str(aux_n_c) for aux_n_c in data.columns]
                data.columns = aux_names_columns
                    
                # If there will be enough dimensionality after one hot encoding
                data_check = pd.get_dummies(data)
                if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                    # Name of the target of the dataset (target_index - 1 since 
                    # in python first position is 0)
                    target_variable = str(data.columns.tolist()[int(target_index) - 1])
        
                    # If it is a binary classification
                    unique_values_count = data[target_variable].nunique()
                    if (unique_values_count == 2):
                        
                        normal_performance = []
                        for c in range(0, num_stat_rep):
                            normal_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                           only_normal=True, seed=c))

                        normal_performance_stdev = statistics.stdev(normal_performance)
                        normal_performance = statistics.mean(normal_performance)
                        print("Normal performance: " + str(round(normal_performance, 4)) + " +- " + str(round(normal_performance_stdev, 4)))
                        
                        cases_new = ["type_aux_mode=linear; selection_method_case=None"]
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=None")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.1")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.2")
                        cases_new.append("type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.1")
                        cases_new.append("type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.2")

                        cases_new_performances = []
                        cases_new_performances_stdev = []
                        
                        
                        for case in cases_new:
                            
                            if ("linear" in case):
                                model_aux = "linear"
                            else:
                                model_aux = "randomforest"

                            if ("None" in case):
                                new_performance = []
                                for c in range(0, num_stat_rep):                            
                                    new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                   type_aux_mod=model_aux, selection_method=None,
                                                                    seed=c))
                                new_performance_stdev = statistics.stdev(new_performance)    
                                cases_new_performances_stdev.append(new_performance_stdev)
                                new_performance = statistics.mean(new_performance)        
                                cases_new_performances.append(new_performance)
                                
                            else:     
                                if ("feature_imp_normal" in case):
                                    frac_feature_imp_normal_value = float(case.split("frac_feature_imp_normal=")[1])
                                    new_performance = []
                                    for c in range(0, num_stat_rep):         
                                        new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                      type_aux_mod=model_aux, selection_method="feature_imp_normal", 
                                                                      frac_feature_imp_normal=frac_feature_imp_normal_value,
                                                                      seed=c))
                                    new_performance_stdev = statistics.stdev(new_performance)
                                    cases_new_performances_stdev.append(new_performance_stdev)    
                                    new_performance = statistics.mean(new_performance)        
                                    cases_new_performances.append(new_performance)
                                else:
                                    new_cols_corr_thr_value = float(case.split("new_cols_corr_thr=")[1])
                                    new_performance = []
                                    for c in range(0, num_stat_rep):         
                                        new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                     type_aux_mod=model_aux, selection_method="correlation_target", 
                                                                     new_cols_corr_thr=new_cols_corr_thr_value, 
                                                                     seed=c))
                                        
                                    new_performance_stdev = statistics.stdev(new_performance)
                                    cases_new_performances_stdev.append(new_performance_stdev)    
                                    new_performance = statistics.mean(new_performance)        
                                    cases_new_performances.append(new_performance)
                                    
                            print(case)
                            print(str(round(new_performance, 4)) + " +- " + str(round(new_performance_stdev, 4)))
                            

                        # Save result
                        result = pd.DataFrame({"Case": cases_new, "Mean F1-score": cases_new_performances, "Stdev F1-score": cases_new_performances_stdev})
                        result["Normal Mean F1-score"] = normal_performance
                        result["Normal stdev F1-score"] = normal_performance_stdev
                        result = result.sort_values(by="Mean F1-score", ascending=False)
                        data_name = data_url.split("/")
                        data_name = data_name[len(data_name) - 1]
                        display(result)
                        result.to_csv("metrics_" + data_name + ".csv", index=False)
                        
                        # Get maximum performance of the new cases
                        if (max(cases_new_performances) > normal_performance):
                            
                            print("New performance: " + str(round(max(cases_new_performances), 3)))
                            # Print details
                            max_index = cases_new_performances.index(max(cases_new_performances))    
                            print("Optimum case new: " + str(cases_new[max_index]))
                        else:
                            print("No improvement found")                        
                                                        
            
            except:   
                print("Dataset " + i + " could not be processed.")
                    
            
    except FileNotFoundError:  
        print(f"The file '{i}' does not exist.")




Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.7824 +- 0.0037
type_aux_mode=linear; selection_method_case=None
0.7866 +- 0.0035
type_aux_mode=randomforest; selection_method_case=None
0.7883 +- 0.0052
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.7912 +- 0.0038
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.7935 +- 0.0037
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5
0.7922 +- 0.0039
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7
0.7883 +- 0.0042
type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.7818 +- 0.0056
type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.7884 +- 0.0042
type_

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
3,type_aux_mode=linear; selection_method_case=fe...,0.793526,0.003749,0.782361,0.00365
4,type_aux_mode=linear; selection_method_case=fe...,0.792178,0.00388,0.782361,0.00365
2,type_aux_mode=linear; selection_method_case=fe...,0.791216,0.003782,0.782361,0.00365
10,type_aux_mode_case=linear; selection_method_ca...,0.791197,0.003904,0.782361,0.00365
8,type_aux_mode=randomforest; selection_method_c...,0.789346,0.006401,0.782361,0.00365
9,type_aux_mode=randomforest; selection_method_c...,0.788834,0.006221,0.782361,0.00365
7,type_aux_mode=randomforest; selection_method_c...,0.788439,0.00424,0.782361,0.00365
1,type_aux_mode=randomforest; selection_method_c...,0.788326,0.005174,0.782361,0.00365
5,type_aux_mode=linear; selection_method_case=fe...,0.788308,0.004182,0.782361,0.00365
13,type_aux_mode_case=randomforest; selection_met...,0.786908,0.005545,0.782361,0.00365


New performance: 0.794
Optimum case new: type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small+adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://a

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
1,type_aux_mode=randomforest; selection_method_c...,0.998486,0.002438,0.991955,0.007678
12,type_aux_mode_case=randomforest; selection_met...,0.998486,0.002438,0.991955,0.007678
13,type_aux_mode_case=randomforest; selection_met...,0.998486,0.002438,0.991955,0.007678
8,type_aux_mode=randomforest; selection_method_c...,0.997974,0.003563,0.991955,0.007678
9,type_aux_mode=randomforest; selection_method_c...,0.997974,0.003563,0.991955,0.007678
3,type_aux_mode=linear; selection_method_case=fe...,0.997002,0.004204,0.991955,0.007678
4,type_aux_mode=linear; selection_method_case=fe...,0.997002,0.003482,0.991955,0.007678
5,type_aux_mode=linear; selection_method_case=fe...,0.997002,0.003482,0.991955,0.007678
11,type_aux_mode_case=linear; selection_method_ca...,0.996007,0.005134,0.991955,0.007678
0,type_aux_mode=linear; selection_method_case=None,0.995506,0.004942,0.991955,0.007678


New performance: 0.998
Optimum case new: type_aux_mode=randomforest; selection_method_case=None
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.6788 +- 0.0315
type_aux_mode=linear; selection_method_case=None
0.6637 +- 0.038
type_aux_mode=randomforest; selection_method_case=None
0.6577 +- 0.0368
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.6788 +- 0.0315
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.6502 +- 0.0235
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5
0.656 +- 0.0333
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7
0.656 +- 0.0333
type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.6788 +- 0.0315
type_aux_m

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
2,type_aux_mode=linear; selection_method_case=fe...,0.678828,0.031536,0.678828,0.031536
6,type_aux_mode=randomforest; selection_method_c...,0.678828,0.031536,0.678828,0.031536
11,type_aux_mode_case=linear; selection_method_ca...,0.670778,0.029694,0.678828,0.031536
13,type_aux_mode_case=randomforest; selection_met...,0.669411,0.034604,0.678828,0.031536
12,type_aux_mode_case=randomforest; selection_met...,0.666182,0.039478,0.678828,0.031536
0,type_aux_mode=linear; selection_method_case=None,0.663735,0.037954,0.678828,0.031536
7,type_aux_mode=randomforest; selection_method_c...,0.65802,0.03099,0.678828,0.031536
1,type_aux_mode=randomforest; selection_method_c...,0.657706,0.036821,0.678828,0.031536
4,type_aux_mode=linear; selection_method_case=fe...,0.655998,0.033284,0.678828,0.031536
5,type_aux_mode=linear; selection_method_case=fe...,0.655998,0.033284,0.678828,0.031536


No improvement found
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.963 +- 0.0171
type_aux_mode=linear; selection_method_case=None
0.9601 +- 0.0097
type_aux_mode=randomforest; selection_method_case=None
0.9652 +- 0.0134
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.9577 +- 0.0175
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.9538 +- 0.0137
type_aux_mode=linear; selection_method_case=feature_imp_nor

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
12,type_aux_mode_case=randomforest; selection_met...,0.967443,0.011724,0.962994,0.017077
13,type_aux_mode_case=randomforest; selection_met...,0.967313,0.010685,0.962994,0.017077
1,type_aux_mode=randomforest; selection_method_c...,0.965214,0.01339,0.962994,0.017077
9,type_aux_mode=randomforest; selection_method_c...,0.96423,0.010468,0.962994,0.017077
8,type_aux_mode=randomforest; selection_method_c...,0.963414,0.012624,0.962994,0.017077
0,type_aux_mode=linear; selection_method_case=None,0.960057,0.009726,0.962994,0.017077
5,type_aux_mode=linear; selection_method_case=fe...,0.959013,0.01564,0.962994,0.017077
7,type_aux_mode=randomforest; selection_method_c...,0.958974,0.011717,0.962994,0.017077
11,type_aux_mode_case=linear; selection_method_ca...,0.958922,0.016295,0.962994,0.017077
4,type_aux_mode=linear; selection_method_case=fe...,0.958016,0.012479,0.962994,0.017077


New performance: 0.967
Optimum case new: type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.1
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wpbc.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king/krkopt.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.9818 +- 0.0085
type_aux_mode=linear; selection_method_case=None
0.9734 +- 0.0095
type_aux_mode=randomforest; selection_method_case=None
0.9776 +- 0.0076
type_aux_mode=lin

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
12,type_aux_mode_case=randomforest; selection_met...,0.982826,0.009577,0.981764,0.00846
10,type_aux_mode_case=linear; selection_method_ca...,0.980083,0.008085,0.981764,0.00846
11,type_aux_mode_case=linear; selection_method_ca...,0.979884,0.008809,0.981764,0.00846
13,type_aux_mode_case=randomforest; selection_met...,0.979884,0.008809,0.981764,0.00846
2,type_aux_mode=linear; selection_method_case=fe...,0.97967,0.007625,0.981764,0.00846
6,type_aux_mode=randomforest; selection_method_c...,0.979663,0.0086,0.981764,0.00846
8,type_aux_mode=randomforest; selection_method_c...,0.979479,0.009619,0.981764,0.00846
3,type_aux_mode=linear; selection_method_case=fe...,0.979453,0.009651,0.981764,0.00846
9,type_aux_mode=randomforest; selection_method_c...,0.978423,0.009629,0.981764,0.00846
1,type_aux_mode=randomforest; selection_method_c...,0.977594,0.007597,0.981764,0.00846


New performance: 0.983
Optimum case new: type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.1
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00252/pop_failures.dat...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00233/CNAE-9.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/vowel/vowel-context.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data...
File downloaded successfully to dataset_file_au

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
2,type_aux_mode=linear; selection_method_case=fe...,0.999712,0.000504,0.999711,0.000374
11,type_aux_mode_case=linear; selection_method_ca...,0.999712,0.000504,0.999711,0.000374
7,type_aux_mode=randomforest; selection_method_c...,0.99971,0.000375,0.999711,0.000374
6,type_aux_mode=randomforest; selection_method_c...,0.999639,0.00051,0.999711,0.000374
0,type_aux_mode=linear; selection_method_case=None,0.999639,0.00051,0.999711,0.000374
3,type_aux_mode=linear; selection_method_case=fe...,0.999639,0.00051,0.999711,0.000374
4,type_aux_mode=linear; selection_method_case=fe...,0.999639,0.00051,0.999711,0.000374
10,type_aux_mode_case=linear; selection_method_ca...,0.999639,0.00051,0.999711,0.000374
8,type_aux_mode=randomforest; selection_method_c...,0.999638,0.000381,0.999711,0.000374
9,type_aux_mode=randomforest; selection_method_c...,0.999638,0.000381,0.999711,0.000374


New performance: 1.0
Optimum case new: type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00244/fertility_Diagnosis.txt...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.6405 +- 0.1033
type_aux_mode=linear; selection_method_case=None
0.5946 +- 0.0921
type_aux_mode=randomforest; selection_method_case=None
0.564 +- 0.0827
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.6405 +- 0.1033
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.6405 +- 0.1033

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
2,type_aux_mode=linear; selection_method_case=fe...,0.640499,0.103315,0.640499,0.103315
3,type_aux_mode=linear; selection_method_case=fe...,0.640499,0.103315,0.640499,0.103315
6,type_aux_mode=randomforest; selection_method_c...,0.640499,0.103315,0.640499,0.103315
7,type_aux_mode=randomforest; selection_method_c...,0.640499,0.103315,0.640499,0.103315
4,type_aux_mode=linear; selection_method_case=fe...,0.622187,0.07641,0.640499,0.103315
11,type_aux_mode_case=linear; selection_method_ca...,0.611968,0.090081,0.640499,0.103315
9,type_aux_mode=randomforest; selection_method_c...,0.609807,0.090067,0.640499,0.103315
5,type_aux_mode=linear; selection_method_case=fe...,0.606248,0.107144,0.640499,0.103315
10,type_aux_mode_case=linear; selection_method_ca...,0.604426,0.094838,0.640499,0.103315
8,type_aux_mode=randomforest; selection_method_c...,0.600713,0.083187,0.640499,0.103315


No improvement found
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/hayes-roth/hayes-roth.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/hepatitis.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/hill-valley/Hill_Valley_without_noise_Training.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/hill-valley/Hill_Valley_with_noise_Tra

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
8,type_aux_mode=randomforest; selection_method_c...,0.646962,0.066621,0.629257,0.052268
9,type_aux_mode=randomforest; selection_method_c...,0.642871,0.060931,0.629257,0.052268
13,type_aux_mode_case=randomforest; selection_met...,0.642863,0.048428,0.629257,0.052268
2,type_aux_mode=linear; selection_method_case=fe...,0.637387,0.045766,0.629257,0.052268
11,type_aux_mode_case=linear; selection_method_ca...,0.63692,0.033235,0.629257,0.052268
1,type_aux_mode=randomforest; selection_method_c...,0.635747,0.063573,0.629257,0.052268
6,type_aux_mode=randomforest; selection_method_c...,0.633027,0.054444,0.629257,0.052268
3,type_aux_mode=linear; selection_method_case=fe...,0.630739,0.038734,0.629257,0.052268
12,type_aux_mode_case=randomforest; selection_met...,0.627336,0.064794,0.629257,0.052268
10,type_aux_mode_case=linear; selection_method_ca...,0.623136,0.04341,0.629257,0.052268


New performance: 0.647
Optimum case new: type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/libras/movement_libras.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/lung-cancer/lung-cancer.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ic

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
3,type_aux_mode=linear; selection_method_case=fe...,0.859373,0.004271,0.856379,0.007044
11,type_aux_mode_case=linear; selection_method_ca...,0.858553,0.006389,0.856379,0.007044
0,type_aux_mode=linear; selection_method_case=None,0.858237,0.004771,0.856379,0.007044
4,type_aux_mode=linear; selection_method_case=fe...,0.858218,0.004685,0.856379,0.007044
5,type_aux_mode=linear; selection_method_case=fe...,0.85786,0.00428,0.856379,0.007044
10,type_aux_mode_case=linear; selection_method_ca...,0.857667,0.006181,0.856379,0.007044
2,type_aux_mode=linear; selection_method_case=fe...,0.8544,0.005265,0.856379,0.007044
9,type_aux_mode=randomforest; selection_method_c...,0.831642,0.004885,0.856379,0.007044
8,type_aux_mode=randomforest; selection_method_c...,0.831295,0.005443,0.856379,0.007044
1,type_aux_mode=randomforest; selection_method_c...,0.827835,0.005529,0.856379,0.007044


New performance: 0.859
Optimum case new: type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/monks-problems/monks-1.test...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/monks-problems/monks-2.test...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/monks-problems/monks-3.test...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 1.0 +-

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
0,type_aux_mode=linear; selection_method_case=None,1.0,0.0,1.0,0.0
1,type_aux_mode=randomforest; selection_method_c...,1.0,0.0,1.0,0.0
2,type_aux_mode=linear; selection_method_case=fe...,1.0,0.0,1.0,0.0
3,type_aux_mode=linear; selection_method_case=fe...,1.0,0.0,1.0,0.0
4,type_aux_mode=linear; selection_method_case=fe...,1.0,0.0,1.0,0.0
5,type_aux_mode=linear; selection_method_case=fe...,1.0,0.0,1.0,0.0
6,type_aux_mode=randomforest; selection_method_c...,1.0,0.0,1.0,0.0
7,type_aux_mode=randomforest; selection_method_c...,1.0,0.0,1.0,0.0
8,type_aux_mode=randomforest; selection_method_c...,1.0,0.0,1.0,0.0
9,type_aux_mode=randomforest; selection_method_c...,1.0,0.0,1.0,0.0


No improvement found
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/eighthr.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/ozone/onehr.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00230/plrx.txt...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archiv

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
0,type_aux_mode=linear; selection_method_case=None,0.997383,0.000561,0.99653,0.000692
10,type_aux_mode_case=linear; selection_method_ca...,0.997383,0.000561,0.99653,0.000692
11,type_aux_mode_case=linear; selection_method_ca...,0.996988,0.000428,0.99653,0.000692
5,type_aux_mode=linear; selection_method_case=fe...,0.996735,0.000685,0.99653,0.000692
2,type_aux_mode=linear; selection_method_case=fe...,0.99653,0.000692,0.99653,0.000692
3,type_aux_mode=linear; selection_method_case=fe...,0.99653,0.000692,0.99653,0.000692
6,type_aux_mode=randomforest; selection_method_c...,0.99653,0.000692,0.99653,0.000692
7,type_aux_mode=randomforest; selection_method_c...,0.99653,0.000692,0.99653,0.000692
4,type_aux_mode=linear; selection_method_case=fe...,0.996105,0.000792,0.99653,0.000692
1,type_aux_mode=randomforest; selection_method_c...,0.99428,0.001002,0.99653,0.000692


New performance: 0.997
Optimum case new: type_aux_mode=linear; selection_method_case=None
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-small.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data...
File downloaded successfully to dataset_file_aux.txt.
Normal performance: 0.9367 +- 0.0106
type_aux_mode=linear; selection_method_case=None
0.9332 +- 0.0097
type_aux_mode=randomforest; selection_method_case=None
0.9415 +- 0.0096
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1
0.9333 +- 0.0084
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3
0.9342 +- 0.0083
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5
0.9363 +- 0.0083
type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_norma

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
8,type_aux_mode=randomforest; selection_method_c...,0.942947,0.009657,0.936735,0.010633
9,type_aux_mode=randomforest; selection_method_c...,0.941818,0.009228,0.936735,0.010633
1,type_aux_mode=randomforest; selection_method_c...,0.941532,0.009578,0.936735,0.010633
7,type_aux_mode=randomforest; selection_method_c...,0.941421,0.009894,0.936735,0.010633
12,type_aux_mode_case=randomforest; selection_met...,0.94013,0.009407,0.936735,0.010633
4,type_aux_mode=linear; selection_method_case=fe...,0.936303,0.008347,0.936735,0.010633
13,type_aux_mode_case=randomforest; selection_met...,0.935945,0.010094,0.936735,0.010633
10,type_aux_mode_case=linear; selection_method_ca...,0.935644,0.010571,0.936735,0.010633
3,type_aux_mode=linear; selection_method_case=fe...,0.934215,0.008308,0.936735,0.010633
5,type_aux_mode=linear; selection_method_case=fe...,0.933953,0.008031,0.936735,0.010633


New performance: 0.943
Optimum case new: type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECT.train...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.train...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z...
File downloaded successfully to dataset_file_aux.txt.
Dataset C:\Users\fjlobo\Desktop\estudio_mod\descarga_de_datas

Unnamed: 0,Case,Mean F1-score,Stdev F1-score,Normal Mean F1-score,Normal stdev F1-score
0,type_aux_mode=linear; selection_method_case=None,0.975289,0.008928,0.954425,0.015065
2,type_aux_mode=linear; selection_method_case=fe...,0.975289,0.008928,0.954425,0.015065
3,type_aux_mode=linear; selection_method_case=fe...,0.975289,0.008928,0.954425,0.015065
4,type_aux_mode=linear; selection_method_case=fe...,0.975289,0.008928,0.954425,0.015065
5,type_aux_mode=linear; selection_method_case=fe...,0.975289,0.008928,0.954425,0.015065
10,type_aux_mode_case=linear; selection_method_ca...,0.975289,0.008928,0.954425,0.015065
11,type_aux_mode_case=linear; selection_method_ca...,0.975289,0.008928,0.954425,0.015065
12,type_aux_mode_case=randomforest; selection_met...,0.954425,0.015065,0.954425,0.015065
13,type_aux_mode_case=randomforest; selection_met...,0.954425,0.015065,0.954425,0.015065
6,type_aux_mode=randomforest; selection_method_c...,0.952975,0.013196,0.954425,0.015065


New performance: 0.975
Optimum case new: type_aux_mode=linear; selection_method_case=None
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00194/sensor_readings_2.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00194/sensor_readings_24.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00194/sensor_readings_4.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/00273/Example_WearableComputing_weight_lifting_exercises_biceps_curl_variations.csv...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uc

In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

# Sample DataFrame
data = {
    'feature1': [1, 2, 3, 4],
    'feature2': [0.5, 1.5, 2.5, 3.5],
    'feature3': [10, 20, 30, 40],
    'feature4': [5, 15, 25, 35],
    'feature5': [2, 1, 2, 3],
    'feature6': [1, 4, 1, 6],
    'target': [0, 1, 0, 1]
}
df = pd.DataFrame(data)

# Define class weights for target variable
class_weights = {0: 1, 1: 3}  # Example class weights (adjust as needed)

# Define columns to exclude from class weights
columns_to_exclude = ['feature3', 'feature4', 'feature5', 'feature6']

# Custom transformer to apply target class weights to specific columns
class ColumnWeightTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, class_weights, columns_to_exclude):
        self.class_weights = class_weights
        self.columns_to_exclude = columns_to_exclude
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Apply class weights to columns not in the specified list
        X_copy = X.copy()
        for col in X.columns:
            if col not in self.columns_to_exclude:
                X_copy[col] *= self.class_weights.get(col, 1)  # Multiply by class weight or 1 if not specified
        return X_copy

# Define a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('apply_class_weights', ColumnWeightTransformer(class_weights, columns_to_exclude), ['feature1', 'feature2']),  # Example numerical columns
        ('other_columns', 'passthrough', ['feature3', 'feature4', 'feature5', 'feature6'])  # Treat these columns normally
    ]
)

# Define RandomForestClassifier pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())  # Apply class weights here
])

# Separate features and target
X = df.drop(columns='target')
y = df['target']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train RandomForestClassifier pipeline
rf_pipeline.fit(X_train, y_train)


In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

# Generate sample data
np.random.seed(0)
n_samples = 1000
n_features = 6
X = np.random.rand(n_samples, n_features)
y = np.random.randint(2, size=n_samples)

# Add imbalance to the target variable
y[np.random.choice(np.where(y == 0)[0], size=n_samples // 5, replace=False)] = 1

# Create a DataFrame
data = {'feature__' + str(i+1): X[:, i] for i in range(n_features)}
data['target'] = y
df = pd.DataFrame(data)

# Define class weights for target variable
class_weights = {0: 1, 1: 3}  # Example class weights (adjust as needed)

# Define columns to exclude from class weights
columns_to_exclude_from_weights = ['feature__4']

# Custom transformer to apply target class weights to specific columns
class ColumnWeightTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, class_weights, columns_to_exclude):
        self.class_weights = class_weights
        self.columns_to_exclude = columns_to_exclude
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Apply class weights to specified columns
        X_copy = X.copy()
        for col in X.columns:
            if col not in self.columns_to_exclude:
                X_copy[col] *= X[col].map(self.class_weights)  # Multiply by class weight or 1 if not specified
                
        return X_copy

# Define a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('apply_class_weights', ColumnWeightTransformer(class_weights, columns_to_exclude_from_weights), slice(None)), # With slice(None) all columns are passed, but internally only those not present in columns_to_exclude_from_weights will be treated
        ('other_columns', 'passthrough', columns_to_exclude_from_weights)  # Treat these columns normally
    ]
)

# Define RandomForestClassifier pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())  # Apply class weights here
])

# Separate features and target
X = df.drop(columns='target')
y = df['target']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid for GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__max_depth': [None, 10, 20]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid, cv=5, n_jobs=-1)

# Perform GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best model from GridSearchCV
best_rf_model = grid_search.best_estimator_

# Evaluate the best model on the test set
test_accuracy = best_rf_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Unnamed: 0,feature__1,feature__2,feature__3,feature__4,feature__5,feature__6,target
0,0.548814,0.715189,0.602763,0.544883,0.423655,0.645894,0
1,0.437587,0.891773,0.963663,0.383442,0.791725,0.528895,1
2,0.568045,0.925597,0.071036,0.087129,0.020218,0.832620,0
3,0.778157,0.870012,0.978618,0.799159,0.461479,0.780529,1
4,0.118274,0.639921,0.143353,0.944669,0.521848,0.414662,1
...,...,...,...,...,...,...,...
995,0.147905,0.529429,0.959006,0.182730,0.259130,0.120583,0
996,0.594602,0.926769,0.238015,0.472421,0.907615,0.708283,1
997,0.528287,0.639307,0.730840,0.728478,0.484977,0.281476,0
998,0.206175,0.918614,0.058247,0.337073,0.690050,0.600069,1


Test Accuracy: 0.66


In [90]:
import os
import sys
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import statistics
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")




def aux_func(data, target_variable, only_normal=False, type_aux_mod="linear", selection_method=None, 
             frac_feature_imp_normal=0.1, new_cols_corr_thr=0.2, seed=123):

    # Custom transformer to apply target class weights to specific columns
    class ColumnWeightTransformer(BaseEstimator, TransformerMixin):
        def __init__(self, class_weights, columns_to_exclude_from_weights):
            self.class_weights = class_weights
            self.columns_to_exclude_from_weights = columns_to_exclude_from_weights        
            # print(columns_to_exclude_from_weights)
        
        def fit(self, result_df, target=None):
            return self
        
        def transform(self, result_df):
            # Apply class weights to specified columns        
            result_df_copy = result_df.copy()

            for col in result_df.columns:
                if col not in self.columns_to_exclude_from_weights:
                    if (col != target_variable):
                        result_df_copy[col] *= result_df[target_variable].map(self.class_weights)
                    
            result_df_copy = result_df_copy.drop(columns=[target_variable])        
            return result_df_copy
        
    #################################
    # PREPROCESSING
    data = data.drop_duplicates()
                            
    # Handling missing values (drop rows with missing values for simplicity)
    data.dropna(inplace=True)
    
    # Encoding categorical variables using one-hot encoding (OHE).
    # First convert the target into categorical. This facilitates later
    # calculations
    data[target_variable] = data[target_variable].astype("category")
    # Now apply OHE to categorical columns (including the target)
    data = pd.get_dummies(data)

    # After OHE there will be two columns created for the target (since it was binary).
    # Choose one of those columns as target and discard the other one. It is not important
    # which of the two is selected as target, since the F1-score performance is measured
    # using 'macro' average option
    aux_names_target = [str(n) for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
    target_variable = aux_names_target[0]
    data = data.drop(columns=[aux_names_target[1]])
                            
    # Normalizing numerical variables. Since previously OHE
    # was performed, all variables with more than 2 different
    # values will be numerical
    numerical_columns = []
    non_numerical_columns = []
    for aux_num in data.columns:
        unique_values_count = data[aux_num].nunique()
        if (unique_values_count > 2):
            numerical_columns.append(aux_num)
        else:
            non_numerical_columns.append(aux_num)
    # Normalization
    if (len(numerical_columns) != 0):
        scaler = StandardScaler()
        aux_data = scaler.fit_transform(data[numerical_columns])
        aux_data = pd.DataFrame(aux_data, columns=numerical_columns)
        aux_data = aux_data.reset_index(drop=True)
        data_non_numerical = data[non_numerical_columns].reset_index(drop=True)
        data = pd.concat([aux_data, data_non_numerical], axis=1)
                                                                                

    # Shuffle the DataFrame to randomize the rows
    data = data.sample(frac=1, random_state=seed)  
                            
    # Save some registers for testing performance:
    data_test = data.sample(frac=0.15, random_state=42)
    data = data.drop(data_test.index)

    # Calculate class weights
    class_counts = data[target_variable].value_counts()
    total_samples = class_counts.sum()
    class_weights = {cls: total_samples / count for cls, count in class_counts.items()}

    # Define custom scorer
    average_metric_type = 'macro'
    custom_scorer = make_scorer(f1_score, average=average_metric_type)

    #################################
    # NORMAL MODEL

    # If the function only returns the normal performance or the selection
    # method of new variables is feature_imp_normal
    if ((only_normal) | (selection_method is not None)):

        X_normal = data.drop(columns=[target_variable])
        y_normal = data[target_variable] 

        # Define RandomForestClassifier
        rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        
        # Define the parameter grid for grid search
        min_samples_split_grid_search = [5, 10, 15]
        min_samples_leaf_grid_search = [5, 10, 15]
        param_grid = {
            'min_samples_split': min_samples_split_grid_search,
            'min_samples_leaf' : min_samples_leaf_grid_search
        }

        
        # Perform grid search        
        grid_search = GridSearchCV(estimator=rf_normal, param_grid=param_grid,
                                   scoring=custom_scorer, cv=5, n_jobs=-1)
        # The normal model takes imbalance into account to generate 
        # maximum normal performance
        grid_search.fit(X_normal, y_normal, sample_weight=y_normal.map(class_weights))
        
        # Get the best normal model
        rf_normal = grid_search.best_estimator_

        # If the call to the function was to just calculate the
        # normal performance
        if (only_normal):
            features_test_normal = data_test.drop(target_variable, axis=1)
            target_test_normal = data_test[target_variable]
            predictions_normal = rf_normal.predict(features_test_normal)
            normal_score = f1_score(target_test_normal, predictions_normal, average=average_metric_type)
                           
            return (normal_score)
    
    
    #######################################
        
                                                
    #################################
    # GENERATION OF AUXILIARY MODELS
        
    # List of dictionaries
    list_of_dictionaries = []
                            
    # For each value of the target
    for target_value in sorted(list(data[target_variable].unique())):
                                
        # Generate auxiliary dataset
        dataset_aux = data[data[target_variable] == target_value]
                                
        # Discard target in auxiliary dataset
        dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
        # Generate dictionary of ficticious targets and the models that predict them:
        dictionary_aux = {}
                                
        for fict_target in dataset_aux.columns.tolist():
        
            # Train auxiliary model and save it
            X = dataset_aux.drop(columns=[fict_target])
            y = dataset_aux[fict_target]                                 

            if (type_aux_mod == "linear"):
                aux_model = LinearRegression(n_jobs=-1)
            else:
                if (type_aux_mod == "randomforest"):
                    aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5, n_jobs=-1)
                else:
                    print("Error: current allowed type for auxiliary models is 'linear' or 'randomforest'")
                    sys.exit()
            
            aux_model.fit(X, y)
                                        
            dictionary_aux[fict_target] = aux_model
                                        
                                    
        list_of_dictionaries.append(dictionary_aux)    
            

    #################################
    # GENERATION OF NEW COLUMNS BASED ON
    # AUXILIARY MODELS
        
    list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
    list_of_rows_dataframe_new = []
                          
    new_columns = []

    # For each value of the target
    for case in range(0, len(list_unique_values_target)):
                              
        dictionary_case = list_of_dictionaries[case]
                                    
        for fict_target in dictionary_case:
                                    
            X = data.drop(columns=[target_variable, fict_target])
            y_predicted = dictionary_case[fict_target].predict(X)
            y_real = data[fict_target]  
                            
            mse = (y_real - y_predicted) ** 2
                            
            rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                          
            # Add column to list of new columns
            new_columns.append(rmse)

        
    dataframe_new = pd.DataFrame(new_columns).transpose()
    names_cols_dataframe_new = []
    for case in range(0, len(list_unique_values_target)):
        dictionary_case = list_of_dictionaries[case]
        for u in dictionary_case.keys():
            names_cols_dataframe_new.append(u + "_" + str(case))
    dataframe_new.columns = names_cols_dataframe_new

    cols_a = data.columns.to_list()
    cols_b = dataframe_new.columns.to_list()
                            
    data = data.reset_index(drop=True)
    dataframe_new = dataframe_new.reset_index(drop=True)
                           
    # Concatenate horizontally
    result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
    result_df.columns = cols_b + cols_a    
                            
    #################################
    # SELECTION OF NEW COLUMNS CREATED TO CREATE
    # AUGMENTED DF.

    if (selection_method == "feature_imp_normal"):
        # Calculations for later selection of new columns
        feature_importances = rf_normal.feature_importances_
        # Create a list of (feature_name, importance) tuples
        feature_importance_tuples = zip(X_normal.columns, feature_importances)
        # Sort the tuples based on importance
        sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
        # Extract sorted feature names
        sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]
        # Select some features
        base_selected_columns = sorted_feature_names[0: int(len(sorted_feature_names) * frac_feature_imp_normal)]
        
        selected_columns = [] 
        for aux_base_cols in base_selected_columns:
            selected_columns.append(aux_base_cols + "_0")
            selected_columns.append(aux_base_cols + "_1")
        
    else:          
        if (selection_method == "correlation_target"):
            # Obtain new variables that have at least certain absolute correlation with the target
            correlation = result_df.corr()[target_variable].abs()
            # Select columns that have a at least certain absolute correlation with the target
            selected_columns = correlation[correlation >= new_cols_corr_thr].index
            selected_columns = [w for w in selected_columns if (w in dataframe_new.columns)]
            
        else:
            if (selection_method is None):
                # Select all created new columns
                selected_columns = dataframe_new.columns.tolist()            
                
            else:
                print("Error. The parameter selection_method must be 'feature_imp_normal', 'correlation_target' or None")
                sys.exit()
                    

    # The augmented dataframe will contain the original columns plus the 
    # selected new columns
    result_df = result_df.loc[:, data.columns.tolist() + selected_columns]

                            

    #################################
    # TRAINING OF MODEL BASED ON AUGMENTED DF.

    features = result_df.drop(target_variable, axis=1)
    target = result_df[target_variable]

    columns_to_exclude_from_weights = [s for s in features.columns if (s in dataframe_new.columns.tolist())] 
    # Define a preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
        ('apply_class_weights', ColumnWeightTransformer(class_weights, columns_to_exclude_from_weights), slice(None)), # With slice(None) all columns are passed, but internally only those not present in columns_to_exclude_from_weights will be treated
        ('other_columns', 'passthrough', columns_to_exclude_from_weights)  # Treat these columns normally
        ]
        )
    # Define RandomForestClassifier pipeline
    rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()) 
    ])
    # Implement in GridSearchCV
    min_samples_split_grid_search = [5, 10, 15]
    min_samples_leaf_grid_search = [5, 10, 15]
    param_grid = {
        'classifier__n_estimators': [100],
        'classifier__min_samples_split': min_samples_split_grid_search,
        'classifier__min_samples_leaf': min_samples_leaf_grid_search
    }
    grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid, 
                               scoring=custom_scorer, cv=5, n_jobs=-1)
    
    
    # Target imbalance is only handled for the columns that are not new, i.e those
    # of the original dataframe.
    # grid_search needs to receive the dataset with all columns (including target),
    # ATTENION: but it receives the target only so that ColumnWeightTransformer can handle weights
    # of the original columns, ColumnWeightTransformer after doing so drops
    # the target variable so that it does not take into account in training.
    # If you are not convinced, you can create an artificial attribute column that is equal
    # to the target, and you shall see that the performance is much better than the one
    # that is actually obtained here.
    grid_search.fit(result_df, target)
  
    # grid_search.fit(features, target)
    
    # Get the best model and corresponding score
    rf_model = grid_search.best_estimator_
                            

    #################################
    # PROCESSING OF TEST DATASET SO THAT THE TRAINED MODEL
    # CAN BE APPLIED TO IT, AND IN THIS WAY OBTAIN 
    # PERFORMANCE METRICS
    
    # For each value of the target
    new_columns = []
    for case in range(0, len(list_unique_values_target)):
                                
        dictionary_case = list_of_dictionaries[case]
        
        for fict_target in dictionary_case:
            if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                X = data_test.drop(columns=[target_variable, fict_target])
                y_predicted = dictionary_case[fict_target].predict(X)
                y_real = data_test[fict_target]  
                               
                mse = (y_real - y_predicted) ** 2
                                
                rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                
                # Add column to list of new columns
                new_columns.append(rmse)
                            
    dataframe_new2 = pd.DataFrame(new_columns).transpose()
    names_cols_dataframe_new2 = []
    for case in range(0, len(list_unique_values_target)):
        dictionary_case = list_of_dictionaries[case]
        for fict_target in dictionary_case:
            if ((fict_target + '_' + str(case)) in selected_columns):    
                names_cols_dataframe_new2.append(fict_target + "_" + str(case))
    dataframe_new2.columns = names_cols_dataframe_new2
                            
    cols_a = data_test.columns.to_list()
    cols_b = dataframe_new2.columns.to_list()
                            
    data_test = data_test.reset_index(drop=True)
    dataframe_new2 = dataframe_new2.reset_index(drop=True)
                                                    
    # Concatenate horizontally
    data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
    data_test_processed.columns = cols_b + cols_a
    # Adapt order of columns to the order in which the model was
    # trained
    data_test_processed = data_test_processed[result_df.columns.to_list()]
                         
    # Now apply trained model on test dataset to gauge performance
    features_test = data_test_processed.drop(target_variable, axis=1)
    target_test = data_test_processed[target_variable]
    # predictions = rf_model.predict(features_test)
    # rf_model needs to receive the dataset with all columns (including target),
    # ATTENION: but it receives the target only so that ColumnWeightTransformer can handle weights
    # of the original columns, ColumnWeightTransformer after doing so drops
    # the target variable so that it does not take into account in training.
    # If you are not convinced, you can create an artificial attribute column that is equal
    # to the target, and you shall see that the performance is much better than the one
    # that is actually obtained here.
    predictions = rf_model.predict(data_test_processed)
                           
    # Calculate F1 score
    f1 = f1_score(target_test, predictions, average=average_metric_type)
    return (f1)






##########################################################################################################################################

# TEST ON DATAFRAMES FROM UCI, DOWNLOAD ADAPTED FROM Perales-González, Carlos, (2020). UCI download-process, v1.3, GitHub repository, https://github.com/cperales/uci-download-process

# Number of statistical repetitions
num_stat_rep = 5

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]

for i in files_classification:
    # try:
    if "haber" in i:
        
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header_option = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]  
                   data_url = data_url.strip()
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       try:
                           if (len(lines[x].split(" = ")) == 1):
                               pass
                           # If the separator is specified    
                           else:
                               separator = lines[x].split(" = ")[1]
                               separator = separator.strip()
                               if ("comma" in separator):
                                   separator = ","
                       except:
                           pass
                               
                   else:
                       if ('target_index' in lines[x]):    
                           try:
                               target_index = lines[x].split(" = ")[1]
                               target_index = int(target_index.strip())
                           except:
                               pass
                       else:
                           if ('header' in lines[x]):    
                               try:
                                   # If there is not header specified, assume
                                   # default value
                                   if (len(lines[x].split(" = ")) == 1):
                                        pass
                                   else:
                                       # If it is specified
                                       header_option = lines[x].split(" = ")[1]
                                       header_option = int(header_option.strip())
                                       # If it is 0 assign it to None
                                       if (str(header_option).startswith("0")):
                                           header_option = None
                               except:
                                   pass
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python', header=header_option)
                aux_names_columns = [str(aux_n_c) for aux_n_c in data.columns]
                data.columns = aux_names_columns
                    
                # If there will be enough dimensionality after one hot encoding
                data_check = pd.get_dummies(data)
                if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                    # Name of the target of the dataset (target_index - 1 since 
                    # in python first position is 0)
                    target_variable = str(data.columns.tolist()[int(target_index) - 1])
        
                    # If it is a binary classification
                    unique_values_count = data[target_variable].nunique()
                    if (unique_values_count == 2):
                        
                        normal_performance = []
                        for c in range(0, num_stat_rep):
                            normal_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                           only_normal=True, seed=c))

                        normal_performance_stdev = statistics.stdev(normal_performance)
                        normal_performance = statistics.mean(normal_performance)
                        print("Normal performance: " + str(round(normal_performance, 4)) + " +- " + str(round(normal_performance_stdev, 4)))
                        
                        cases_new = ["type_aux_mode=linear; selection_method_case=None"]
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=None")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.1")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.2")
                        cases_new.append("type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.1")
                        cases_new.append("type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.2")

                        cases_new_performances = []
                        cases_new_performances_stdev = []
                        
                        
                        for case in cases_new:
                            
                            if ("linear" in case):
                                model_aux = "linear"
                            else:
                                model_aux = "randomforest"

                            if ("None" in case):
                                new_performance = []
                                for c in range(0, num_stat_rep):                            
                                    new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                   type_aux_mod=model_aux, selection_method=None,
                                                                    seed=c))
                                new_performance_stdev = statistics.stdev(new_performance)    
                                cases_new_performances_stdev.append(new_performance_stdev)
                                new_performance = statistics.mean(new_performance)        
                                cases_new_performances.append(new_performance)
                                
                            else:     
                                if ("feature_imp_normal" in case):
                                    frac_feature_imp_normal_value = float(case.split("frac_feature_imp_normal=")[1])
                                    new_performance = []
                                    for c in range(0, num_stat_rep):         
                                        new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                      type_aux_mod=model_aux, selection_method="feature_imp_normal", 
                                                                      frac_feature_imp_normal=frac_feature_imp_normal_value,
                                                                      seed=c))
                                    new_performance_stdev = statistics.stdev(new_performance)
                                    cases_new_performances_stdev.append(new_performance_stdev)    
                                    new_performance = statistics.mean(new_performance)        
                                    cases_new_performances.append(new_performance)
                                else:
                                    new_cols_corr_thr_value = float(case.split("new_cols_corr_thr=")[1])
                                    new_performance = []
                                    for c in range(0, num_stat_rep):         
                                        new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                     type_aux_mod=model_aux, selection_method="correlation_target", 
                                                                     new_cols_corr_thr=new_cols_corr_thr_value, 
                                                                     seed=c))
                                        
                                    new_performance_stdev = statistics.stdev(new_performance)
                                    cases_new_performances_stdev.append(new_performance_stdev)    
                                    new_performance = statistics.mean(new_performance)        
                                    cases_new_performances.append(new_performance)
                                    
                            print(case)
                            print(str(round(new_performance, 4)) + " +- " + str(round(new_performance_stdev, 4)))
                            

                        # Save result
                        result = pd.DataFrame({"Case": cases_new, "Mean F1-score": cases_new_performances, "Stdev F1-score": cases_new_performances_stdev})
                        result["Normal Mean F1-score"] = normal_performance
                        result["Normal stdev F1-score"] = normal_performance_stdev
                        result = result.sort_values(by="Mean F1-score", ascending=False)
                        data_name = data_url.split("/")
                        data_name = data_name[len(data_name) - 1]
                        display(result)
                        result.to_csv("metrics_" + data_name + ".csv", index=False)
                        
                        # Get maximum performance of the new cases
                        if (max(cases_new_performances) > normal_performance):
                            
                            print("New performance: " + str(round(max(cases_new_performances), 3)))
                            # Print details
                            max_index = cases_new_performances.index(max(cases_new_performances))    
                            print("Optimum case new: " + str(cases_new[max_index]))
                        else:
                            print("No improvement found")                        
                                                        
            
            # except:   
            else:    
                print("Dataset " + i + " could not be processed.")
                    
            
    # except FileNotFoundError:  
    else:
        print(f"The file '{i}' does not exist.")




The file 'C:\Users\fjlobo\Desktop\estudio_mod\descarga_de_datasets_de_uci\directorio3\datafiles\classification\adult\config.ini' does not exist.
The file 'C:\Users\fjlobo\Desktop\estudio_mod\descarga_de_datasets_de_uci\directorio3\datafiles\classification\balance-scale\config.ini' does not exist.
The file 'C:\Users\fjlobo\Desktop\estudio_mod\descarga_de_datasets_de_uci\directorio3\datafiles\classification\balloons-a\config.ini' does not exist.
The file 'C:\Users\fjlobo\Desktop\estudio_mod\descarga_de_datasets_de_uci\directorio3\datafiles\classification\balloons-b\config.ini' does not exist.
The file 'C:\Users\fjlobo\Desktop\estudio_mod\descarga_de_datasets_de_uci\directorio3\datafiles\classification\balloons-c\config.ini' does not exist.
The file 'C:\Users\fjlobo\Desktop\estudio_mod\descarga_de_datasets_de_uci\directorio3\datafiles\classification\balloons-d\config.ini' does not exist.
The file 'C:\Users\fjlobo\Desktop\estudio_mod\descarga_de_datasets_de_uci\directorio3\datafiles\classi

KeyboardInterrupt: 