In [1]:
# Usando modelos auxiliares lineales

import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")



# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        # aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a
                            
                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data_test.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data_test[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
F1 Score: [0.66268062 0.91267896]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data
...
File downloaded successfully to dat

In [1]:
# Usando modelos auxiliares de RF

import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")



# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        # aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a
                            
                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data_test.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data_test[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
F1 Score: [0.64616886 0.90597847]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data
...
File downloaded successfully to dat

In [None]:
# Se puede observar que dependiendo de si se usan modelos auxiliares lineales o de RF
# se obtienen unos resultados u otros --> Para cada dataset habrá que ver cuál es la mejor
# opción

In [None]:
# Pruebas añadiendo como nuevas columnas solo aquellas que estén
# por encima de un cierto umbral en su correlación con la target.
# Por lo menos estableciendo un threshold del 20%, en la mayoría
# de los casos empeora, pero en algunos se obtiene una mejora
# significativa --> Para cada dataset habrá que ver cuál es la mejor
# opción, o quizá buscar otra forma de probar con distintas nuevas
# columnas

# Para todas estas pruebas además habrá que realizar varias repeticiones
# estadísticas para obtener datos más fiables

In [15]:
# Usando modelos auxiliares lineales y threshold de 20%
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Threshold of correlation with target for new columns
new_cols_threshold = 0.2

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        # aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            selected_columns = selected_columns.tolist()
                            print(selected_columns)
                            print(len(selected_columns))
                            print(result_df.shape)
                            
                            result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
[]
0
(27656, 325)
F1 Score: [0.67607004 0.9135514 ]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data
...
File downloaded s

In [16]:
# Usando modelos auxiliares de RF y threshold de 20%
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Threshold of correlation with target for new columns
new_cols_threshold = 0.2

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        # aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new
                            
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            selected_columns = selected_columns.tolist()
                            print(selected_columns)
                            print(len(selected_columns))
                            print(result_df.shape)
                            
                            result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
['39_0', ' 13_1', ' Bachelors_ 12th_1', ' Bachelors_ Assoc-voc_0', ' Bachelors_ Preschool_0', ' Not-in-family_ Other-relative_1']
6
(27656, 325)
F1 Score: [0.6651439 0.9031832]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Do

In [None]:
# Pruebas realizando un feature importance de un modelo generado solo con las
# variables nuevas, y extrayendo el 10% de las variables nuevas con mayor 
# feature importance para añadir al dataset. Esta estrategia por lo general
# parece poco efectiva, a diferencia de la selección por correlación con la
# target, donde a veces (en el caso de los modelos lineales) sí se apreciaba
# una ventaja.

In [20]:
# Usando modelos auxiliares lineales
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Threshold of correlation with target for new columns
new_cols_threshold = 0.2

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        # aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new

                            #####################
                            ####################
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            cols_b = dataframe_new.columns.to_list()
                            
                            
                            # Concatenate horizontally
                            aux_df = pd.concat([dataframe_new, data[[target_variable]]], axis=1, ignore_index=True)
                            aux_df.columns = cols_b + [target_variable]
                            
                            features = aux_df.drop(target_variable, axis=1)
                            target = aux_df[target_variable]
                            rf_model_aux = RandomForestClassifier(n_estimators=200, random_state=42, min_samples_split=10, min_samples_leaf=5)
                            rf_model_aux.fit(features, target)
                            
                            feature_importances = rf_model_aux.feature_importances_
                            
                            # Create a list of (feature_name, importance) tuples
                            feature_importance_tuples = zip(features.columns, feature_importances)
                                                        
                            # Sort the tuples based on importance
                            sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
                            
                            # Extract sorted feature names
                            sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]

                            # Select some features
                            selected_columns = sorted_feature_names[0: int(len(sorted_feature_names)/10)]
                            
                            ####################
                            ####################

                            dataframe_new = dataframe_new.loc[:, selected_columns]
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            # correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            # selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            # selected_columns = selected_columns.tolist()
                            # print(selected_columns)
                            # print(len(selected_columns))
                            # print(result_df.shape)
                            
                            # result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
F1 Score: [0.52893519 0.89865538]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data
...
File downloaded successfully to dat

In [18]:
# Usando modelos auxiliares de RF
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Threshold of correlation with target for new columns
new_cols_threshold = 0.2

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]   
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       if (len(lines[x].split(" = ")) == 1):
                           pass
                       # If the separator is specified    
                       else:
                           separator = lines[x].split(" = ")[1]
                           separator = separator.strip()
                           if ("comma" in separator):
                               separator = ","
                               
                   else:
                       if ('target_index' in lines[x]):    
                           target_index = lines[x].split(" = ")[1]
                       else:
                           if ('header' in lines[x]):    
                               # If there is not header specified, assume
                               # default value
                               if (len(lines[x].split(" = ")) == 1):
                                    pass
                               else:
                                   # If it is specified
                                   header = lines[x].split(" = ")[1]
                                   # If it is 0 assign it to None
                                   if (header.startswith("0")):
                                       header = None
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python')
                    print(data.shape)
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = data.columns.tolist()[int(target_index) - 1]
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [n for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')
                            
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        # aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new

                            #####################
                            ####################
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            cols_b = dataframe_new.columns.to_list()
                            
                            
                            # Concatenate horizontally
                            aux_df = pd.concat([dataframe_new, data[[target_variable]]], axis=1, ignore_index=True)
                            aux_df.columns = cols_b + [target_variable]
                            
                            features = aux_df.drop(target_variable, axis=1)
                            target = aux_df[target_variable]
                            rf_model_aux = RandomForestClassifier(n_estimators=200, random_state=42, min_samples_split=10, min_samples_leaf=5)
                            rf_model_aux.fit(features, target)
                            
                            feature_importances = rf_model_aux.feature_importances_
                            
                            # Create a list of (feature_name, importance) tuples
                            feature_importance_tuples = zip(features.columns, feature_importances)
                                                        
                            # Sort the tuples based on importance
                            sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
                            
                            # Extract sorted feature names
                            sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]

                            # Select some features
                            selected_columns = sorted_feature_names[0: int(len(sorted_feature_names)/10)]
                            ####################
                            ####################

                            dataframe_new = dataframe_new.loc[:, selected_columns]
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            # correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            # selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            # selected_columns = selected_columns.tolist()
                            # print(selected_columns)
                            # print(len(selected_columns))
                            # print(result_df.shape)
                            
                            # result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to read the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
...
File downloaded successfully to dataset_file_aux.txt.
(32560, 15)
F1 Score normal: [0.67607004 0.9135514 ]
0
#########################################
1
#########################################
F1 Score: [0.58933195 0.89870992]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data
...
File downloaded successfully to dataset_file_aux.txt.
(624, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data
...
File downloaded successfully to dataset_file_aux.txt.
(19, 5)
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data
...
File downloaded successfully to dat

In [None]:
# Selección de nuevas columnas en base a la importancia de las variables
# del modelo normal. En concreto, se toman como base el 10% de las variables
# con más importancia en el modelo normal y luego se seleccionan para cada
# una de ellas la variable nueva terminada en "_0" y la terminada en "_1".

# Al menos usando modelos auxiliares lineales, se observa algunos casos donde
# mejora, como en el breast cancer dataset

In [2]:
# Usando modelos auxiliares lineales
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header_option = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]  
                   data_url = data_url.strip()
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       try:
                           if (len(lines[x].split(" = ")) == 1):
                               pass
                           # If the separator is specified    
                           else:
                               separator = lines[x].split(" = ")[1]
                               separator = separator.strip()
                               if ("comma" in separator):
                                   separator = ","
                       except:
                           pass
                               
                   else:
                       if ('target_index' in lines[x]):    
                           try:
                               target_index = lines[x].split(" = ")[1]
                               target_index = int(target_index.strip())
                           except:
                               pass
                       else:
                           if ('header' in lines[x]):    
                               try:
                                   # If there is not header specified, assume
                                   # default value
                                   if (len(lines[x].split(" = ")) == 1):
                                        pass
                                   else:
                                       # If it is specified
                                       header_option = lines[x].split(" = ")[1]
                                       header_option = int(header_option.strip())
                                       # If it is 0 assign it to None
                                       if (str(header_option).startswith("0")):
                                           header_option = None
                               except:
                                   pass
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python', header=header_option)
                    aux_names_columns = [str(aux_n_c) for aux_n_c in data.columns]
                    data.columns = aux_names_columns
                    
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = str(data.columns.tolist()[int(target_index) - 1])
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [str(n) for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')

                            #######################################
                            # Calculations for later selection of new columns
                            feature_importances = rf_normal.feature_importances_
                            # Create a list of (feature_name, importance) tuples
                            feature_importance_tuples = zip(X_train.columns, feature_importances)
                            # Sort the tuples based on importance
                            sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
                            # Extract sorted feature names
                            sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]
                            # Select some features
                            base_selected_columns = sorted_feature_names[0: int(len(sorted_feature_names)/10)]
                            #######################################
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        # aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new

                            #####################
                            ####################
                            
                            # Select some features
                            selected_columns = [] 
                            for aux_base_cols in base_selected_columns:
                                selected_columns.append(aux_base_cols + "_0")
                                selected_columns.append(aux_base_cols + "_1")
                            
                            ####################
                            ####################

                            dataframe_new = dataframe_new.loc[:, selected_columns]
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            # correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            # selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            # selected_columns = selected_columns.tolist()
                            # print(selected_columns)
                            # print(len(selected_columns))
                            # print(result_df.shape)
                            
                            # result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to process the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data...
File downloaded successfully to dataset_file_aux.txt.
F1 Score normal: [0.69234427 0.91552422]
0
#########################################
1
#########################################
F1 Score: [0.64977192 0.91128515]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from h

In [3]:
# Usando modelos auxiliares RF
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header_option = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]  
                   data_url = data_url.strip()
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       try:
                           if (len(lines[x].split(" = ")) == 1):
                               pass
                           # If the separator is specified    
                           else:
                               separator = lines[x].split(" = ")[1]
                               separator = separator.strip()
                               if ("comma" in separator):
                                   separator = ","
                       except:
                           pass
                               
                   else:
                       if ('target_index' in lines[x]):    
                           try:
                               target_index = lines[x].split(" = ")[1]
                               target_index = int(target_index.strip())
                           except:
                               pass
                       else:
                           if ('header' in lines[x]):    
                               try:
                                   # If there is not header specified, assume
                                   # default value
                                   if (len(lines[x].split(" = ")) == 1):
                                        pass
                                   else:
                                       # If it is specified
                                       header_option = lines[x].split(" = ")[1]
                                       header_option = int(header_option.strip())
                                       # If it is 0 assign it to None
                                       if (str(header_option).startswith("0")):
                                           header_option = None
                               except:
                                   pass
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python', header=header_option)
                    aux_names_columns = [str(aux_n_c) for aux_n_c in data.columns]
                    data.columns = aux_names_columns
                    
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = str(data.columns.tolist()[int(target_index) - 1])
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [str(n) for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')

                            #######################################
                            # Calculations for later selection of new columns
                            feature_importances = rf_normal.feature_importances_
                            # Create a list of (feature_name, importance) tuples
                            feature_importance_tuples = zip(X_train.columns, feature_importances)
                            # Sort the tuples based on importance
                            sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
                            # Extract sorted feature names
                            sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]
                            # Select some features
                            base_selected_columns = sorted_feature_names[0: int(len(sorted_feature_names)/10)]
                            #######################################
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:
                                        aux_model = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        # aux_model = LinearRegression()
                                        aux_model.fit(X_train, y_train)
                                        dictionary_aux[fict_target] = aux_model
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared = 1
                                        else:    
                                            r_squared = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    
                                        
                                        dictionary_aux_r_squared[fict_target] = r_squared
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new

                            #####################
                            ####################
                            
                            # Select some features
                            selected_columns = [] 
                            for aux_base_cols in base_selected_columns:
                                selected_columns.append(aux_base_cols + "_0")
                                selected_columns.append(aux_base_cols + "_1")
                            
                            ####################
                            ####################

                            dataframe_new = dataframe_new.loc[:, selected_columns]
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            # correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            # selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            # selected_columns = selected_columns.tolist()
                            # print(selected_columns)
                            # print(len(selected_columns))
                            # print(result_df.shape)
                            
                            # result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    if ((u + '_' + str(case)) in selected_columns):    
                                        names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to process the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data...
File downloaded successfully to dataset_file_aux.txt.
F1 Score normal: [0.69234427 0.91552422]
0
#########################################
1
#########################################
F1 Score: [0.67910751 0.89159929]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from h

In [None]:
# Original but choosing for each auxiliary model
# the best performance (linear vs RF). It seems
# that this giving in general poor results

In [6]:
# Usando modelos auxiliares RF
import os
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]
for i in files_classification:
    try:
        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header_option = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]  
                   data_url = data_url.strip()
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       try:
                           if (len(lines[x].split(" = ")) == 1):
                               pass
                           # If the separator is specified    
                           else:
                               separator = lines[x].split(" = ")[1]
                               separator = separator.strip()
                               if ("comma" in separator):
                                   separator = ","
                       except:
                           pass
                               
                   else:
                       if ('target_index' in lines[x]):    
                           try:
                               target_index = lines[x].split(" = ")[1]
                               target_index = int(target_index.strip())
                           except:
                               pass
                       else:
                           if ('header' in lines[x]):    
                               try:
                                   # If there is not header specified, assume
                                   # default value
                                   if (len(lines[x].split(" = ")) == 1):
                                        pass
                                   else:
                                       # If it is specified
                                       header_option = lines[x].split(" = ")[1]
                                       header_option = int(header_option.strip())
                                       # If it is 0 assign it to None
                                       if (str(header_option).startswith("0")):
                                           header_option = None
                               except:
                                   pass
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                try:
                    data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python', header=header_option)
                    aux_names_columns = [str(aux_n_c) for aux_n_c in data.columns]
                    data.columns = aux_names_columns
                    
                    # If there will be enough dimensionality after one hot encoding
                    data_check = pd.get_dummies(data)
                    if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                        # Name of the target of the dataset (target_index - 1 since 
                        # in python first position is 0)
                        target_variable = str(data.columns.tolist()[int(target_index) - 1])
        
                        # If it is a binary classification
                        unique_values_count = data[target_variable].nunique()
                        if (unique_values_count == 2):
            
                            # -------------------------------------------------------------------------------------
                            data = data.drop_duplicates()
                            
                            # Handling missing values (drop rows with missing values for simplicity)
                            data.dropna(inplace=True)
              
                            
                            # Encoding categorical variables using one-hot encoding (OHE)
                            data = pd.get_dummies(data)
        
                            # Obtain new name of the target (after OHE) and discard the other option
                            aux_names_target = [str(n) for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
                            # If the target was subjected to OHE
                            if (len(aux_names_target) != 0):
                                # Obtain new name of the target variable
                                target_variable = aux_names_target[0]
                                # Discard other columns with other values of the target variable, since
                                # including them would artificially yield high performance
                                for g in range(1, len(aux_names_target)):    
                                    data = data.drop(columns=[aux_names_target[g]])
                            
                            
                            # Normalizing variables:
                            data_columns = data.columns
                            scaler = StandardScaler()
                            data = scaler.fit_transform(data)
                            data = pd.DataFrame(data, columns=data_columns)
                            
                            
                            # Denormalize target values (these must be 0 or 1):
                            def aux_denormalize_target(aux):
                                threshold = min(list(data[target_variable].unique()))
                                if aux > threshold:
                                    return 1
                                else:
                                    return 0
                            
                            data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
                            
                            # Shuffle the DataFrame to randomize the rows
                            data = data.sample(frac=1, random_state=12)  
                            
                            
                            # Save some registers for testing performance:
                            data_test = data.sample(frac=0.15, random_state=42)
                            data = data.drop(data_test.index)
                            
                            #############################
                            # Normal performance
                            X_train = data.drop(columns=[target_variable])
                            y_train = data[target_variable] 
                            X_test = data_test.drop(columns=[target_variable])
                            y_test = data_test[target_variable]
                            
                            rf_normal = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_normal.fit(X_train, y_train)
                            
                            predictions_normal = rf_normal.predict(X_test)
                            
                            # Calculate F1 score for each class
                            f1_normal = f1_score(y_test, predictions_normal, average=None)
                            print(f'F1 Score normal: {f1_normal}')

                            #######################################
                            # Calculations for later selection of new columns
                            feature_importances = rf_normal.feature_importances_
                            # Create a list of (feature_name, importance) tuples
                            feature_importance_tuples = zip(X_train.columns, feature_importances)
                            # Sort the tuples based on importance
                            sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
                            # Extract sorted feature names
                            sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]
                            # Select some features
                            base_selected_columns = sorted_feature_names[0: int(len(sorted_feature_names)/10)]
                            #######################################
                            
                            
                            #############################
                            
                            
                            
                            # List of dictionaries
                            list_of_dictionaries = []
                            
                            # List of dictionaries of R²
                            list_of_dictionaries_r_squared = []
                            
                            # For each value of the target
                            for target_value in sorted(list(data[target_variable].unique())):
                                print(target_value)
                                print("#########################################")
                            
                                # Generate auxiliary dataset
                                dataset_aux = data[data[target_variable] == target_value]
                                
                                # Discard target in auxiliary dataset
                                dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
                                # Generate dictionary of ficticious targets and the models that predict them:
                                dictionary_aux = {}
                                # Correspondant dictionary of rmse for weighing 
                                dictionary_aux_r_squared = {}
                                
                                for fict_target in dataset_aux.columns.tolist():
                                    # print(fict_target)
                                    
                                    # Train auxiliary model and save it
                                    X = dataset_aux.drop(columns=[fict_target])
                                    y = dataset_aux[fict_target] 
                                    
                                    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                            
                                    
                                    # Fit the auxiliary model:
                                    if True:

                                        # First try RF
                                        aux_model_RF = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                                        aux_model_RF.fit(X_train, y_train)
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model_RF.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared_RF = 1
                                        else:    
                                            r_squared_RF = 1 - (rss / tss)    

                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    

  
                                        # Now try linear regression
                                        aux_model_linear = LinearRegression()
                                        aux_model_linear.fit(X_train, y_train)
                                        
                                        
                                        #####
                                        # Computation of R²* for weighing:
                                        # (*)Actually, it is a variation of R² so that the values are
                                        # in the range [0, 1] negative R² values will be converted to 0,
                                        # so it is not really R²
                                        predictions = aux_model_linear.predict(X_test)
                                        y_mean = np.mean(y_test)
                                        # Calculate the total sum of squares
                                        tss = np.sum((y_test - y_mean) ** 2)
                                        # Calculate the residual sum of squares
                                        rss = np.sum((y_test - predictions) ** 2)
                                        # Calculate R² score
                                        # If tss == 0 then R² will be 1
                                        if (tss < 0.00001) & (tss > -0.00001):
                                            r_squared_linear  = 1
                                        else:    
                                            r_squared_linear = 1 - (rss / tss)    
                                        # Apply modification
                                        # if (r_squared < 0):
                                        #    r_squared = 0
                                        # print(r_squared)    

                                        if (r_squared_RF > r_squared_linear):
                                            dictionary_aux[fict_target] = aux_model_RF
                                            dictionary_aux_r_squared[fict_target] = r_squared_RF
                                        else:
                                            dictionary_aux[fict_target] = aux_model_linear
                                            dictionary_aux_r_squared[fict_target] = r_squared_linear
                                    
                                list_of_dictionaries.append(dictionary_aux)    
                                list_of_dictionaries_r_squared.append(dictionary_aux_r_squared)    
                            
                            list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
                            
                            list_of_rows_dataframe_new = []
                            
                            
                            
                            
                            new_columns = []
                            
                            # For each value of the target
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    X = data.drop(columns=[target_variable, fict_target])
                                    y_predicted = dictionary_case[fict_target].predict(X)
                                    y_real = data[fict_target]  
                            
                                    mse = (y_real - y_predicted) ** 2
                            
                                    rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                    weighing_value = dictionary_case_r_squared[fict_target]
                                    # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]
                            
                                    # Add column to list of new columns
                                    new_columns.append(rmse)
                            
                            dataframe_new = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new = []
                            for u in dictionary_case.keys():
                                    names_cols_dataframe_new.append(u + "_0")
                                    names_cols_dataframe_new.append(u + "_1")        
                            dataframe_new.columns = names_cols_dataframe_new

                            #####################
                            ####################
                            
                            # Select some features
                            selected_columns = [] 
                            for aux_base_cols in base_selected_columns:
                                selected_columns.append(aux_base_cols + "_0")
                                selected_columns.append(aux_base_cols + "_1")
                            
                            ####################
                            ####################

                            # dataframe_new = dataframe_new.loc[:, selected_columns]
                            
                            cols_a = data.columns.to_list()
                            cols_b = dataframe_new.columns.to_list()
                            
                            data = data.reset_index(drop=True)
                            dataframe_new = dataframe_new.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
                            result_df.columns = cols_b + cols_a

                            #####
                            # Calculate correlation with the target variable
                            # correlation = result_df.corr()[target_variable].abs()
                            # Select columns whose names end with '_0' or '_1' and have a at least certain absolute correlation with the target
                            # selected_columns = correlation[(correlation.index.str.endswith('_0') | correlation.index.str.endswith('_1')) & (correlation >= new_cols_threshold)].index
                            # selected_columns = selected_columns.tolist()
                            # print(selected_columns)
                            # print(len(selected_columns))
                            # print(result_df.shape)
                            
                            # result_df = result_df[data.columns.tolist() + selected_columns]
                            #####

                            
                            
                            # The process has generated additional columns in the dataframe (those ending with _0 or _1).
                            # These additional columns could enhance potentially performance.
                            # The whole cycle may be repeated again (sort of a new layer) generating more additional
                            # variables (these will contain also those now ending with _0_0, _0_1, _1_0, and 1_1).
                            
                            
                            # Train model
                            features = result_df.drop(target_variable, axis=1)
                            target = result_df[target_variable]
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_split=5, min_samples_leaf=5)
                            rf_model.fit(features, target)
                            
                            
                            #######################################################################
                            # Now process the test dataset so that the model can be applied to it
                            # For each value of the target
                            new_columns = []
                            for case in range(0, len(list_unique_values_target)):
                                
                                dictionary_case = list_of_dictionaries[case]
                                dictionary_case_r_squared = list_of_dictionaries_r_squared[case]
                                    
                                for fict_target in dictionary_case:
                                    
                                    # if ((fict_target + '_' + str(case)) in selected_columns):
                                    if True:
                                    
                                        X = data_test.drop(columns=[target_variable, fict_target])
                                        y_predicted = dictionary_case[fict_target].predict(X)
                                        y_real = data_test[fict_target]  
                                
                                        mse = (y_real - y_predicted) ** 2
                                
                                        rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                        weighing_value = dictionary_case_r_squared[fict_target]
                                        # rmse = [float(rmse_value * (weighing_value**2)) for rmse_value in rmse]

                                        rmse = [1e5 if (x > 1e5) else x for x in rmse]
                                
                                        # Add column to list of new columns
                                        new_columns.append(rmse)
                            
                            dataframe_new2 = pd.DataFrame(new_columns).transpose()
                            names_cols_dataframe_new2 = []
                            for case in range(0, len(list_unique_values_target)):
                                for u in dictionary_case.keys():
                                    # if ((u + '_' + str(case)) in selected_columns):    
                                    names_cols_dataframe_new2.append(u + "_" + str(case))
                            dataframe_new2.columns = names_cols_dataframe_new2
                            
                            cols_a = data_test.columns.to_list()
                            cols_b = dataframe_new2.columns.to_list()
                            
                            data_test = data_test.reset_index(drop=True)
                            dataframe_new2 = dataframe_new2.reset_index(drop=True)
                            
                            
                            # Concatenate horizontally
                            data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
                            data_test_processed.columns = cols_b + cols_a
                            # Adapt order of columns to the order in which the model was
                            # trained
                            data_test_processed = data_test_processed[result_df.columns.to_list()]
                            #######################################################################
                            
                            # Now apply trained model on test dataset to gauge performance
                            features_test = data_test_processed.drop(target_variable, axis=1)
                            target_test = data_test_processed[target_variable]
                            predictions = rf_model.predict(features_test)
                            
                            # Calculate F1 score for each class
                            f1 = f1_score(target_test, predictions, average=None)
                            print(f'F1 Score: {f1}')
                            # -------------------------------------------------------------------------------------
        
        
                except:
                    print("It was not possible to process the downloaded dataset")
    

            
            # except:
            else:
                print("Dataset " + i + " could not be processed.")
            
            
            
                      
                    
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")


Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data...
File downloaded successfully to dataset_file_aux.txt.
F1 Score normal: [0.69234427 0.91552422]
0
#########################################
1
#########################################
F1 Score: [0.60213904 0.90572732]
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult-stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/adult+stretch.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from http://archive.ics.uci.edu/ml/machine-learning-databases/balloons/yellow-small.data...
File downloaded successfully to dataset_file_aux.txt.
Downloading file from h

In [None]:
# Testing multiple hyperparameters (type of auxiliary model, selection of new variables)

In [72]:
import pandas as pd

# Example DataFrame
data = {
    'A': [0, 1, 0, 1, 0],
    'B': [1, 1, 1, 1, 0],
    'C': [1, 0, 0, 1, 0],
    'D': [0, 0, 0, 0, 1]
}

df = pd.DataFrame(data)

# Count the number of 1 values for each column
count_ones = df.sum()

# Find the name of the column with the fewest 1 values
column_with_fewest_ones = count_ones.idxmin()

print("Column with the fewest 1 values:", column_with_fewest_ones)


Column with the fewest 1 values: D


In [None]:
import os
import sys
import urllib.request
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import statistics


def return_files(directory):
    """
    Returns all files in the specified directory.
    """
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    # Get a list of all files in the directory
    files = os.listdir(directory)
    
    return (files)


def download_file(url, destination):
    """
    Download a file from the specified URL to the specified destination.
    """
    try:
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, destination)
        print(f"File downloaded successfully to {destination}.")
    except Exception as e:
        print(f"An error occurred: {e}")


def aux_func(data, target_variable, only_normal=False, type_aux_mod="linear", selection_method=None, 
             frac_feature_imp_normal=0.1, new_cols_corr_thr=0.2, seed=123):
    
    #################################
    # PREPROCESSING
    data = data.drop_duplicates()
                            
    # Handling missing values (drop rows with missing values for simplicity)
    data.dropna(inplace=True)
              
                            
    # Encoding categorical variables using one-hot encoding (OHE)
    data = pd.get_dummies(data)
        
    # Obtain new name of the target (after OHE) and discard the other option
    aux_names_target = [str(n) for n in data.columns.tolist() if (n.startswith(target_variable + "_"))]
    # If the target was subjected to OHE
    if (len(aux_names_target) != 0):
        # Obtain new name of the target variable (it will be the one with fewer 1 values)
        # Sum the 1 values for each column
        count_ones = data.loc[:, aux_names_target].sum()
        # Find the name of the column with the fewest 1 values
        target_variable = count_ones.idxmin()
        
        # Discard other columns with other values of the target variable, since
        # including them would artificially yield high performance
        for g in aux_names_target:    
            if (g != target_variable):
                data = data.drop(columns=[g])
                            
    # Normalizing variables:
    data_columns = data.columns
    scaler = StandardScaler()
    data = scaler.fit_transform(data)
    data = pd.DataFrame(data, columns=data_columns)
                            
                            
    # Denormalize target values (these must be 0 or 1):
    def aux_denormalize_target(aux):
        threshold = min(list(data[target_variable].unique()))
        if aux > threshold:
            return 1
        else:
            return 0
                            
    data[target_variable] = data[target_variable].apply(aux_denormalize_target)
                            
    # Shuffle the DataFrame to randomize the rows
    data = data.sample(frac=1, random_state=seed)  
                            
    # Save some registers for testing performance:
    data_test = data.sample(frac=0.15, random_state=42)
    data = data.drop(data_test.index)
     
    #################################
    # NORMAL MODEL

    # If the function only returns the normal performance or the selection
    # method of new variables is feature_imp_normal
    if ((only_normal) | (selection_method is not None)):

        X_normal = data.drop(columns=[target_variable])
        y_normal = data[target_variable] 

        # Define RandomForestClassifier
        rf_normal = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
        
        # Define the parameter grid for grid search
        min_samples_split_grid_search = [5, 10, 15]
        min_samples_leaf_grid_search = [5, 10, 15]
        param_grid = {
            'min_samples_split': min_samples_split_grid_search,
            'min_samples_leaf' : min_samples_leaf_grid_search
        }
        
        # Perform grid search
        grid_search = GridSearchCV(estimator=rf_normal, param_grid=param_grid, scoring='f1', cv=5, n_jobs=-1)
        grid_search.fit(X_normal, y_normal)
        
        # Get the best normal model and corresponding score
        rf_normal = grid_search.best_estimator_

        # If the call to the function was to just calculate the
        # normal performance
        if (only_normal):
            features_test_normal = data_test.drop(target_variable, axis=1)
            target_test_normal = data_test[target_variable]
            predictions_normal = rf_normal.predict(features_test_normal)
            normal_score = f1_score(target_test_normal, predictions_normal)
                           
            return (normal_score)
    
    
    #######################################
        
                                                
    #################################
    # GENERATION OF AUXILIARY MODELS
        
    # List of dictionaries
    list_of_dictionaries = []
                            
    # For each value of the target
    for target_value in sorted(list(data[target_variable].unique())):
                                
        # Generate auxiliary dataset
        dataset_aux = data[data[target_variable] == target_value]
                                
        # Discard target in auxiliary dataset
        dataset_aux = dataset_aux.drop(columns=[target_variable])
                                
        # Generate dictionary of ficticious targets and the models that predict them:
        dictionary_aux = {}
                                
        for fict_target in dataset_aux.columns.tolist():
        
            # Train auxiliary model and save it
            X = dataset_aux.drop(columns=[fict_target])
            y = dataset_aux[fict_target] 
                                
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            if (type_aux_mod == "linear"):
                aux_model = LinearRegression(n_jobs=-1)
            else:
                if (type_aux_mod == "randomforest"):
                    aux_model = RandomForestRegressor(n_estimators=200, random_state=42, min_samples_split=5, min_samples_leaf=5, n_jobs=-1)
                else:
                    print("Error: current allowed type for auxiliary models is 'linear' or 'randomforest'")
                    sys.exit()
            
            aux_model.fit(X_train, y_train)
                                        
            dictionary_aux[fict_target] = aux_model
                                        
                                    
        list_of_dictionaries.append(dictionary_aux)    
            

    #################################
    # GENERATION OF NEW COLUMNS BASED ON
    # AUXILIARY MODELS
        
    list_unique_values_target = sorted(list(data[target_variable].unique()))
                                
    list_of_rows_dataframe_new = []
                          
    new_columns = []

    # For each value of the target
    for case in range(0, len(list_unique_values_target)):
                              
        dictionary_case = list_of_dictionaries[case]
                                    
        for fict_target in dictionary_case:
                                    
            X = data.drop(columns=[target_variable, fict_target])
            y_predicted = dictionary_case[fict_target].predict(X)
            y_real = data[fict_target]  
                            
            mse = (y_real - y_predicted) ** 2
                            
            rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                          
            # Add column to list of new columns
            new_columns.append(rmse)

        
    dataframe_new = pd.DataFrame(new_columns).transpose()
    names_cols_dataframe_new = []
    for case in range(0, len(list_unique_values_target)):
        dictionary_case = list_of_dictionaries[case]
        for u in dictionary_case.keys():
            names_cols_dataframe_new.append(u + "_" + str(case))
    dataframe_new.columns = names_cols_dataframe_new

    cols_a = data.columns.to_list()
    cols_b = dataframe_new.columns.to_list()
                            
    data = data.reset_index(drop=True)
    dataframe_new = dataframe_new.reset_index(drop=True)
                           
    # Concatenate horizontally
    result_df = pd.concat([dataframe_new, data], axis=1, ignore_index=True)
    result_df.columns = cols_b + cols_a    
                            
    #################################
    # SELECTION OF NEW COLUMNS CREATED TO CREATE
    # AUGMENTED DF.

    if (selection_method == "feature_imp_normal"):
        # Calculations for later selection of new columns
        feature_importances = rf_normal.feature_importances_
        # Create a list of (feature_name, importance) tuples
        feature_importance_tuples = zip(X_normal.columns, feature_importances)
        # Sort the tuples based on importance
        sorted_feature_importance_tuples = sorted(feature_importance_tuples, key=lambda x: x[1], reverse=True)
        # Extract sorted feature names
        sorted_feature_names = [feature_name for feature_name, _ in sorted_feature_importance_tuples]
        # Select some features
        base_selected_columns = sorted_feature_names[0: int(len(sorted_feature_names) * frac_feature_imp_normal)]
        
        selected_columns = [] 
        for aux_base_cols in base_selected_columns:
            selected_columns.append(aux_base_cols + "_0")
            selected_columns.append(aux_base_cols + "_1")
        
    else:          
        if (selection_method == "correlation_target"):
            # Obtain new variables that have at least certain absolute correlation with the target
            correlation = result_df.corr()[target_variable].abs()
            # Select columns that have a at least certain absolute correlation with the target
            selected_columns = correlation[correlation >= new_cols_corr_thr].index
            selected_columns = [w for w in selected_columns if (w in dataframe_new.columns)]
            
        else:
            if (selection_method is None):
                # Select all created new columns
                selected_columns = dataframe_new.columns.tolist()            
                
            else:
                print("Error. The parameter selection_method must be 'feature_imp_normal', 'correlation_target' or None")
                sys.exit()
                    

    # The augmented dataframe will contain the original columns plus the 
    # selected new columns
    result_df = result_df.loc[:, data.columns.tolist() + selected_columns]

                            

    #################################
    # TRAINING OF MODEL BASED ON AUGMENTED DF.

    features = result_df.drop(target_variable, axis=1)
    target = result_df[target_variable]
    
    rf_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
        
    # Define the parameter grid for grid search
    min_samples_split_grid_search = [5, 10, 15]
    min_samples_leaf_grid_search = [5, 10, 15]
    param_grid = {
        'min_samples_split': min_samples_split_grid_search,
        'min_samples_leaf' : min_samples_leaf_grid_search
    }
        
    # Perform grid search
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='f1', cv=5, n_jobs=-1)
    grid_search.fit(features, target)
        
    # Get the best model and corresponding score
    rf_model = grid_search.best_estimator_
                            

    #################################
    # PROCESSING OF TEST DATASET SO THAT THE TRAINED MODEL
    # CAN BE APPLIED TO IT, AND IN THIS WAY OBTAIN 
    # PERFORMANCE METRICS
    
    # For each value of the target
    new_columns = []
    for case in range(0, len(list_unique_values_target)):
                                
        dictionary_case = list_of_dictionaries[case]
        
        for fict_target in dictionary_case:
            if ((fict_target + '_' + str(case)) in selected_columns):
                                    
                X = data_test.drop(columns=[target_variable, fict_target])
                y_predicted = dictionary_case[fict_target].predict(X)
                y_real = data_test[fict_target]  
                               
                mse = (y_real - y_predicted) ** 2
                                
                rmse = [round(np.sqrt(row_mse_value),4) for row_mse_value in mse]
                                
                # Add column to list of new columns
                new_columns.append(rmse)
                            
    dataframe_new2 = pd.DataFrame(new_columns).transpose()
    names_cols_dataframe_new2 = []
    for case in range(0, len(list_unique_values_target)):
        dictionary_case = list_of_dictionaries[case]
        for u in dictionary_case.keys():
            if ((u + '_' + str(case)) in selected_columns):    
                names_cols_dataframe_new2.append(u + "_" + str(case))
    dataframe_new2.columns = names_cols_dataframe_new2
                            
    cols_a = data_test.columns.to_list()
    cols_b = dataframe_new2.columns.to_list()
                            
    data_test = data_test.reset_index(drop=True)
    dataframe_new2 = dataframe_new2.reset_index(drop=True)
                                                    
    # Concatenate horizontally
    data_test_processed = pd.concat([dataframe_new2, data_test], axis=1, ignore_index=True)
    data_test_processed.columns = cols_b + cols_a
    # Adapt order of columns to the order in which the model was
    # trained
    data_test_processed = data_test_processed[result_df.columns.to_list()]
                         
    # Now apply trained model on test dataset to gauge performance
    features_test = data_test_processed.drop(target_variable, axis=1)
    target_test = data_test_processed[target_variable]
    predictions = rf_model.predict(features_test)
                           
    # Calculate F1 score
    f1 = f1_score(target_test, predictions)
    return (f1)






##########################################################################################################################################

# TEST ON DATAFRAMES FROM UCI, DOWNLOAD ADAPTED FROM Perales-González, Carlos, (2020). UCI download-process, v1.3, GitHub repository, https://github.com/cperales/uci-download-process

# Number of statistical repetitions
num_stat_rep = 10

# Get the current working directory
current_directory = os.getcwd()

# Specify the directory you want to list files for
directory_path_classification = os.path.join(current_directory, 'descarga_de_datasets_de_uci', 'directorio3', 
                              'datafiles', 'classification')

# Call the function to return files in the directory
files_classification = return_files(directory_path_classification)
files_classification = [os.path.join(directory_path_classification, x, "config.ini") for x in files_classification]

for i in files_classification:
    try:

        with open(i, 'r') as file:
            lines = file.readlines()            
            len_lines = len(lines)
            data_url = ""
            separator = "\\s+"
            target_index = ""
            header_option = True
            for x in range(0, len_lines):
               if ('data_url' in lines[x]):
                   data_url = lines[x].split(" = ")[1]  
                   data_url = data_url.strip()
               else:
                   if ('separator' in lines[x]):
                       # If there is not separator specified, assume
                       # default value ("\t")
                       try:
                           if (len(lines[x].split(" = ")) == 1):
                               pass
                           # If the separator is specified    
                           else:
                               separator = lines[x].split(" = ")[1]
                               separator = separator.strip()
                               if ("comma" in separator):
                                   separator = ","
                       except:
                           pass
                               
                   else:
                       if ('target_index' in lines[x]):    
                           try:
                               target_index = lines[x].split(" = ")[1]
                               target_index = int(target_index.strip())
                           except:
                               pass
                       else:
                           if ('header' in lines[x]):    
                               try:
                                   # If there is not header specified, assume
                                   # default value
                                   if (len(lines[x].split(" = ")) == 1):
                                        pass
                                   else:
                                       # If it is specified
                                       header_option = lines[x].split(" = ")[1]
                                       header_option = int(header_option.strip())
                                       # If it is 0 assign it to None
                                       if (str(header_option).startswith("0")):
                                           header_option = None
                               except:
                                   pass
                   
                
            # Fetch plain text content from the URL
            download_file(data_url, 'dataset_file_aux.txt')
            # Read downloaded file
            # try:
            if True:
                data = pd.read_csv('dataset_file_aux.txt', sep=separator, engine='python', header=header_option)
                aux_names_columns = [str(aux_n_c) for aux_n_c in data.columns]
                data.columns = aux_names_columns
                    
                # If there will be enough dimensionality after one hot encoding
                data_check = pd.get_dummies(data)
                if (data_check.shape[0] >= (data_check.shape[1]*3*10)):
                    
                    # Name of the target of the dataset (target_index - 1 since 
                    # in python first position is 0)
                    target_variable = str(data.columns.tolist()[int(target_index) - 1])
        
                    # If it is a binary classification
                    unique_values_count = data[target_variable].nunique()
                    if (unique_values_count == 2):
                        
                        normal_performance = []
                        for c in range(0, num_stat_rep):
                            normal_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                           only_normal=True, seed=c))
                        
                        normal_performance = statistics.mean(normal_performance)
                        print("Normal performance: " + str(round(normal_performance, 3)))
                        
                        cases_new = ["type_aux_mode=linear; selection_method_case=None"]
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=None")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5")
                        cases_new.append("type_aux_mode=linear; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.1")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.3")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.5")
                        cases_new.append("type_aux_mode=randomforest; selection_method_case=feature_imp_normal; frac_feature_imp_normal=0.7")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.1")
                        cases_new.append("type_aux_mode_case=linear; selection_method_case=correlation_target; new_cols_corr_thr=0.2")
                        cases_new.append("type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.1")
                        cases_new.append("type_aux_mode_case=randomforest; selection_method_case=correlation_target; new_cols_corr_thr=0.2")

                        cases_new_performances = []
                        cases_new_performances_stdev = []
                        
                        
                        for case in cases_new:
                            
                            if ("linear" in case):
                                model_aux = "linear"
                            else:
                                model_aux = "randomforest"

                            if ("None" in case):
                                new_performance = []
                                for c in range(0, num_stat_rep):                            
                                    new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                   type_aux_mod=model_aux, selection_method=None, seed=c))
                                cases_new_performances_stdev.append(statistics.stdev(new_performance))
                                new_performance = statistics.mean(new_performance)        
                                cases_new_performances.append(new_performance)
                                
                            else:     
                                if ("feature_imp_normal" in case):
                                    frac_feature_imp_normal_value = float(case.split("frac_feature_imp_normal=")[1])
                                    new_performance = []
                                    for c in range(0, num_stat_rep):         
                                        new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                      type_aux_mod=model_aux, selection_method="feature_imp_normal", 
                                                                      frac_feature_imp_normal=frac_feature_imp_normal_value, seed=c))
                                    cases_new_performances_stdev.append(statistics.stdev(new_performance))    
                                    new_performance = statistics.mean(new_performance)        
                                    cases_new_performances.append(new_performance)
                                else:
                                    new_cols_corr_thr_value = float(case.split("new_cols_corr_thr=")[1])
                                    new_performance = []
                                    for c in range(0, num_stat_rep):         
                                        new_performance.append(aux_func(data=data.copy(), target_variable=target_variable,
                                                                     type_aux_mod=model_aux, selection_method="correlation_target", 
                                                                  new_cols_corr_thr=new_cols_corr_thr_value, seed=c))
                                    cases_new_performances_stdev.append(statistics.stdev(new_performance))    
                                    new_performance = statistics.mean(new_performance)        
                                    cases_new_performances.append(new_performance)
                            
                            print(case)
                            print(new_performance)

                        # Save result
                        result = pd.DataFrame({"Case": cases_new, "Mean F1-score": cases_new_performances, "Stdev F1-score": cases_new_performances_stdev})
                        result = result.sort_values(by="Mean F1-score", ascending=False)
                        data_name = data_url.split("/")
                        data_name = data_name[len(data_name) - 1]
                        display(result)
                        result.to_csv("metrics_" + data_name + ".csv", index=False)
                        
                        # Get maximum performance of the new cases
                        if (max(cases_new_performances) > normal_performance):
                            
                            print("New performance: " + str(round(max(cases_new_performances), 3)))
                            # Print details
                            max_index = cases_new_performances.index(max(cases_new_performances))    
                            print("Optimum case new: " + str(cases_new[max_index]))
                        else:
                            print("No improvement found")                        
                                
                        
            
            # except:
            else:    
                print("Dataset " + i + " could not be processed.")
            
            
            
    except FileNotFoundError:
        print(f"The file '{i}' does not exist.")




Downloading file from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data...
File downloaded successfully to dataset_file_aux.txt.
