# Apply forward feature selection for different targets

## Yield

In [None]:

from concurrent.futures import ProcessPoolExecutor
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import pickle
from multiprocessing import cpu_count
import pandas as pd
from sklearn.model_selection import LeavePOut
import numpy as np
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score




def load_data(feature_file, target_file,target_name):
    '''
    Read the feature matrix and a single target column from CSV files.
    Args:
        feature_file: str, path of the CSV file holding the feature matrix.
        target_file: str, path of the CSV file holding the target columns.
        target_name: str, name of the column in target_file to use as target.
    Returns:
        features: DataFrame, every column of feature_file.
        target: Series, the target_name column of target_file.
    '''
    features = pd.read_csv(feature_file)
    all_targets = pd.read_csv(target_file)
    return features, all_targets[target_name]

def train_random_forest(X_train, y_train, seed):
    '''
    Fit a random forest classifier on the given training data.
    Args:
        X_train: training feature matrix.
        y_train: training targets.
        seed: value passed as random_state to RandomForestClassifier.
    Returns:
        The fitted RandomForestClassifier.
    '''
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return RandomForestClassifier(random_state=seed).fit(X_train, y_train)


def evaluate_feature(args):
    '''
    Score one candidate feature by leave-one-out accuracy averaged over 9 seeds.

    The candidate is appended to the already selected features; for every
    leave-one-out split of X a random forest is trained once per seed and
    scored on the single held-out sample. All arguments arrive packed in one
    tuple.
    Args:
        args: tuple (feature, selected_features, X, y) with
            feature: str, the candidate feature (column of X) to be added.
            selected_features: list of str, the features already selected.
            X: DataFrame, the feature matrix used for the evaluation.
            y: Series, classification targets, positionally aligned with X.
    Returns:
        feature: str, the candidate feature that was evaluated.
        mean_accuracy: float, mean accuracy over all (split, seed) pairs.
    '''
    feature, selected_features, X, y = args
    # Concatenation builds a new list, so the caller's list is left untouched.
    X_current = X[selected_features + [feature]]

    seeds = [20,30,42,50,60,80,100,200,500]
    scores = []
    for fit_rows, held_out_rows in LeaveOneOut().split(X_current):
        X_fit = X_current.iloc[fit_rows]
        X_held_out = X_current.iloc[held_out_rows]
        y_fit = y.iloc[fit_rows]
        y_held_out = y.iloc[held_out_rows]
        # One model per seed; every (split, seed) pair contributes one score.
        for seed in seeds:
            forest = train_random_forest(X_fit, y_fit, seed=seed)
            prediction = forest.predict(X_held_out)
            scores.append(accuracy_score(y_held_out, prediction))

    return feature, np.mean(scores)

#     return test_accuracy

def train_and_evaluate_model(X_train, y_train, X_test, y_test):
    '''
    Train a random forest (random_state=42) and return its test accuracy.
    Args:
        X_train: training feature matrix.
        y_train: training targets.
        X_test: test feature matrix.
        y_test: test targets.
    Returns:
        The accuracy_score of the predictions on X_test against y_test.
    '''
    forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    predictions = forest.predict(X_test)
    return accuracy_score(y_test, predictions)

def run(X, y, output_folder, num_folds, max_features=10):
    '''
    Run forward stepwise feature selection on evenly spaced leave-4-out splits.

    Every leave-4-out split whose four held-out targets sum to 2 is collected
    (for 0/1-encoded targets that means two samples of each class in the test
    set), and num_folds of them are picked at evenly spaced positions of that
    list. For each picked split, features are added greedily: every remaining
    feature is scored on the training part with evaluate_feature (in parallel
    worker processes), the best one is kept, and a model trained on the
    features selected so far is scored on the four held-out samples.

    Side effects: creates <output_folder>/<iteration>/ for every split and
    pickles the list of (feature, mean_accuracy) candidate scores of each
    selection step to <k>_selected.pkl, where k is the number of features
    selected before that step. Progress is printed to stdout.
    Args:
        X: DataFrame, the whole feature matrix.
        y: column vector of DataFrame, classification targets of samples.
        output_folder: str, the directory to save the outputs.
        num_folds: int, the number of different dataset splits to evaluate.
        max_features: int, maximum number of features to select per split
            (capped at the number of columns of X). Defaults to 10.
    Returns:
        result: list of dict, one dict per evaluated split, each mapping
            (tuple of selected features) -> (best CV accuracy, test accuracy).
    Raises:
        ValueError: if no leave-4-out split has held-out targets summing to 2.
    '''
    # Only the targets are needed to decide whether a split is eligible, so the
    # feature matrix is not sliced while enumerating the combinations.
    lpo = LeavePOut(4)
    eligible_splits = [
        (train_index, test_index)
        for train_index, test_index in lpo.split(X)
        if np.sum(y.iloc[test_index]) == 2
    ]
    if not eligible_splits:
        raise ValueError(
            "No leave-4-out split has held-out targets summing to 2; "
            "cannot build evaluation splits."
        )

    # Evenly spaced positions in the list of eligible splits. Positions are
    # truncated to int below, so splits repeat when num_folds exceeds the
    # number of eligible splits.
    samples = np.linspace(start=0, stop=len(eligible_splits) - 1, num=num_folds)

    # Never try to select more features than there are columns.
    n_to_select = min(max_features, X.shape[1])

    result = []
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        for iteration, sample in enumerate(samples, start=1):
            iteration_folder = os.path.join(output_folder, str(iteration))
            os.makedirs(iteration_folder, exist_ok=True)
            train_index, test_index = eligible_splits[int(sample)]
            print(f"Iteration {iteration}: Points left out - {test_index}")

            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            selected_features = []
            instant_result = {}
            while len(selected_features) < n_to_select:
                # Keep candidates in column order: iterating a set of strings is
                # not stable across interpreter runs, which would make the
                # tie-breaking of max() below (first maximum wins) irreproducible.
                already_selected = set(selected_features)
                remaining_features = [
                    column for column in X_train.columns
                    if column not in already_selected
                ]

                # Score every candidate in parallel; map() keeps the input order.
                feature_args = [
                    (feature, selected_features, X_train, y_train)
                    for feature in remaining_features
                ]
                candidate_scores = list(executor.map(evaluate_feature, feature_args))
                step_file = os.path.join(
                    iteration_folder, '{}_selected.pkl'.format(len(selected_features))
                )
                with open(step_file, 'wb') as file:
                    pickle.dump(candidate_scores, file)

                performance_dict = dict(candidate_scores)

                # Select the best-performing feature and add it to the selected features
                best_feature = max(performance_dict, key=performance_dict.get)
                best_performance = performance_dict[best_feature]
                selected_features.append(best_feature)
                print(f"Best Feature: {best_feature}")
                print(f"Best Performance: {best_performance}")
                print("Selected Features: ", selected_features)
                print("=" * 100)

                # Train and evaluate the model using the selected features
                accuracy = train_and_evaluate_model(
                    X_train[selected_features], y_train,
                    X_test[selected_features], y_test,
                )
                print(f"Test Set Accuracy with Selected Features: {accuracy}")
                print("=" * 100)
                instant_result[tuple(selected_features)] = (best_performance, accuracy)

            result.append(instant_result)
    return result

In [None]:
# Driver cell for the 'yield_cla' target: load the data, run the forward
# feature selection and pickle the returned results.
feature_file = "../agent/data/del_and_exp_features.csv"
target_file = "../agent/data/all_targets.csv"
target_name = 'yield_cla'  # column of target_file used as classification target
X, y = load_data(feature_file, target_file,target_name)
output_folder = "./output/yield_cla"
num_folds = 50  # number of dataset splits passed to run()

# Make sure the output directory exists before anything is written into it.
os.makedirs(output_folder, exist_ok=True)

result = run(X, y,output_folder,num_folds)
# Persist the value returned by run() next to the per-iteration outputs.
with open(os.path.join(output_folder, 'result.pkl'), 'wb') as file:
    pickle.dump(result, file)

Iteration 1: Points left out - [ 0  1 10 11]
Best Feature: loading(Fe)
Best Performance: 0.75
Selected Features:  ['loading(Fe)']
Test Set Accuracy with Selected Features: 0.5
Best Feature: frag_level_2_TPSA_Mean
Best Performance: 0.8993055555555556
Selected Features:  ['loading(Fe)', 'frag_level_2_TPSA_Mean']
Test Set Accuracy with Selected Features: 0.5
Best Feature: MaxPartialCharge
Best Performance: 0.9236111111111112
Selected Features:  ['loading(Fe)', 'frag_level_2_TPSA_Mean', 'MaxPartialCharge']
Test Set Accuracy with Selected Features: 0.25
Best Feature: NumAliphaticRings
Best Performance: 0.9201388888888888
Selected Features:  ['loading(Fe)', 'frag_level_2_TPSA_Mean', 'MaxPartialCharge', 'NumAliphaticRings']
Test Set Accuracy with Selected Features: 0.5
Best Feature: Atom_PEOE-Charge_Mean
Best Performance: 0.9270833333333334
Selected Features:  ['loading(Fe)', 'frag_level_2_TPSA_Mean', 'MaxPartialCharge', 'NumAliphaticRings', 'Atom_PEOE-Charge_Mean']
Test Set Accuracy with Sel

## Fe/Hf

In [None]:
# Driver cell for the 'Fe/Hf_cla' target: load the data, run the forward
# feature selection and pickle the returned results.
feature_file = "../agent/data/del_features.csv"
target_file = "../agent/data/all_targets.csv"
target_name = 'Fe/Hf_cla'  # column of target_file used as classification target
X, y = load_data(feature_file, target_file,target_name)
# The folder name uses '-' where the target name has '/'.
output_folder = "./output/Fe-Hf_cla"
num_folds = 50  # number of dataset splits passed to run()

# Make sure the output directory exists before anything is written into it.
os.makedirs(output_folder, exist_ok=True)

result = run(X, y,output_folder,num_folds)
# Persist the value returned by run() next to the per-iteration outputs.
with open(os.path.join(output_folder, 'result.pkl'), 'wb') as file:
    pickle.dump(result, file)

Iteration 1: Points left out - [0 1 2 4]
Best Feature: NumHeteroatoms
Best Performance: 0.75
Selected Features:  ['NumHeteroatoms']
Test Set Accuracy with Selected Features: 0.5
Best Feature: Atom_LogP_Max
Best Performance: 0.78125
Selected Features:  ['NumHeteroatoms', 'Atom_LogP_Max']
Test Set Accuracy with Selected Features: 0.75
Best Feature: frag_level_2_LabuteASA_Mean
Best Performance: 0.7986111111111112
Selected Features:  ['NumHeteroatoms', 'Atom_LogP_Max', 'frag_level_2_LabuteASA_Mean']
Test Set Accuracy with Selected Features: 0.5
Best Feature: NumHDonors
Best Performance: 0.7986111111111112
Selected Features:  ['NumHeteroatoms', 'Atom_LogP_Max', 'frag_level_2_LabuteASA_Mean', 'NumHDonors']
Test Set Accuracy with Selected Features: 0.5
Best Feature: Atom_MR_Max
Best Performance: 0.8194444444444444
Selected Features:  ['NumHeteroatoms', 'Atom_LogP_Max', 'frag_level_2_LabuteASA_Mean', 'NumHDonors', 'Atom_MR_Max']
Test Set Accuracy with Selected Features: 0.5
Best Feature: NumHA